Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# base nvidia CUDA image
# The tag selects CUDA 11.8.0, the "devel" variant, on Ubuntu 20.04.
# NOTE(review): the jaxlib wheel installed further down is a "+cuda111" build;
# confirm it runs correctly against the CUDA 11.8 libraries in this image.
FROM nvidia/cuda:11.8.0-devel-ubuntu20.04

# initialize directories
# HOME_DIR is the project root inside the image; LOG_DIR and TMP_DIR default
# to sub-directories of it. All three can be overridden with --build-arg.
ARG HOME_DIR=/workspace
ARG LOG_DIR=${HOME_DIR}/logs
ARG TMP_DIR=${HOME_DIR}/tmp
# One layer instead of three. `-p` creates missing parents and does not fail
# when a directory already exists (e.g. HOME_DIR overridden to an existing path).
RUN mkdir -p "${HOME_DIR}" "${LOG_DIR}" "${TMP_DIR}"

# All following relative paths (COPY destination, requirements.txt) resolve here.
WORKDIR ${HOME_DIR}
# NOTE(review): copying the whole build context this early means any source
# change invalidates the cached dependency-install layers below.
COPY . .

# install important dependencies
# DEBIAN_FRONTEND is declared as ARG (not ENV) so it silences interactive
# prompts during the build without leaking into the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive
# Uses apt-get rather than apt: apt's command-line interface is not stable for
# scripted use. The three Python packages are installed in a single transaction
# and the package lists are removed in the same layer to keep the image small
# (the next apt step runs its own `apt-get update`).
# NOTE(review): Ubuntu 20.04 already ships python3.8; confirm the deadsnakes
# PPA is still needed.
RUN apt-get update && apt-get -y upgrade && \
    apt-get install -y software-properties-common && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get install -y \
        python-is-python3 \
        python3-pip \
        python3.8 && \
    rm -rf /var/lib/apt/lists/*

# Build tools, utilities and cuDNN. `update` and `install` share one layer so
# the install never runs against a stale package index, and the index is
# removed afterwards so it does not bloat the image.
# NOTE(review): libcudnn8 is unpinned; verify the version apt selects is built
# for the CUDA release of the base image.
RUN apt-get update && apt-get install -y \
        build-essential \
        curl \
        libcudnn8 \
        vim && \
    rm -rf /var/lib/apt/lists/*

# Install the project's pinned Python dependencies (requirements.txt was copied
# into WORKDIR above). --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir -r requirements.txt

# so jax can detect GPUs
# Installs the "+cuda111" jaxlib 0.1.60 wheel, resolved through the JAX release
# index passed with -f.
RUN pip install --no-cache-dir -U jaxlib==0.1.60+cuda111 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html

# symlink
# Exposes the image's libcusolver.so.11 under the name libcusolver.so.10.
# NOTE(review): presumably one of the pinned Python packages loads the ".so.10"
# name; confirm which one and whether the ABI mismatch is acceptable.
RUN ln -s /usr/local/cuda/lib64/libcusolver.so.11 /usr/local/cuda/lib64/libcusolver.so.10
# Append the CUDA library directory to the dynamic loader search path.
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64"

# TODO: define the default command in exec form, e.g. CMD ["python", "main.py"]
3 changes: 2 additions & 1 deletion configs/default_cifar10_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ def get_default_configs():
config = ml_collections.ConfigDict()
# training
config.training = training = ml_collections.ConfigDict()
config.training.batch_size = 128
# config.training.batch_size = 128
config.training.batch_size = 2
training.n_iters = 1300001
training.snapshot_freq = 50000
training.log_freq = 50
Expand Down
52 changes: 52 additions & 0 deletions docker.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Mr Yang's `score_sde`, dockerized

## OS Prereqs

### Install GPU drivers

1. Ensure packages are up to date, then install the NVIDIA driver

`sudo apt update && sudo apt -y upgrade && sudo apt-get -y install ubuntu-drivers-common && sudo ubuntu-drivers autoinstall && sudo apt install nvidia-driver-515`

Depending on the card, install the most up-to-date driver version that supports it (replace `515` in the command above accordingly):

- `t4`: 515
- `v100`: 520

To check:

- `ubuntu-drivers devices`
- `nvidia-smi`

2. Reboot and wait for the instance to come back up (about a minute)

`sudo reboot`

3. Verify installation

`nvidia-smi`


### Set up docker environment

1. Install docker

- ubuntu 22.04:
`curl https://get.docker.com | sh \
&& sudo systemctl --now enable docker`

- ubuntu 20.04: follow the official guide below
`https://docs.docker.com/engine/install/ubuntu/`

2. Install the Nvidia Container Toolkit and restart docker

`distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
&& curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - \
&& curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list && sudo apt-get update && sudo apt-get install -y nvidia-docker2`

`sudo systemctl restart docker && sudo chmod 666 /var/run/docker.sock`


### Build container

TODO
14 changes: 14 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,20 @@ def main(argv):
tf.config.experimental.set_visible_devices([], "GPU")
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'

print("@@@@@@@@@")
gpus = tf.config.list_physical_devices('GPU')
if gpus:
try:
# Currently, memory growth needs to be the same across GPUs
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
logical_gpus = tf.config.list_logical_devices('GPU')
print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
# Memory growth must be set before GPUs have been initialized
print(e)
print("@@@@@@@@@")

if FLAGS.mode == "train":
# Create the working directory
tf.io.gfile.makedirs(FLAGS.workdir)
Expand Down
10 changes: 8 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
ml-collections==0.1.0
tensorflow-gan==2.0.0
tensorflow_io
tensorflow_io==0.17.1
tensorflow_datasets==3.1.0
tensorflow==2.4.0
tensorflow-addons==0.12.0
tensorboard==2.4.0
absl-py==0.10.0
flax==0.3.1
jax==0.2.8
jaxlib==0.1.59
jaxlib==0.1.60
tensorflow-probability==0.12.2
numpy==1.19.5
six==1.15.0
termcolor==1.1.0
wrapt==1.12.1
protobuf==3.17.3