diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..2cf7201
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,57 @@
+# Base image: NVIDIA CUDA 11.8 development image on Ubuntu 20.04.
+FROM nvidia/cuda:11.8.0-devel-ubuntu20.04
+
+# Build-time only: keeps apt from prompting. Declared as ARG so it is not
+# baked into the runtime environment.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# System dependencies. Installed before any project files are copied so this
+# layer stays cached when only the source tree changes. `apt-get` is used
+# instead of `apt` (stable CLI for scripts), update+install share one layer,
+# and the package lists are removed in the same layer.
+# No blanket `upgrade`: bump the base image tag to pick up OS updates.
+# NOTE(review): libcudnn8 is unpinned, so apt takes the newest build in the
+# NVIDIA repo, which may target a newer CUDA than 11.8 -- consider pinning.
+RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository -y ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y \
+        build-essential \
+        curl \
+        libcudnn8 \
+        python-is-python3 \
+        python3-pip \
+        python3.8 \
+        vim && \
+    rm -rf /var/lib/apt/lists/*
+
+# Workspace layout.
+ARG HOME_DIR=/workspace
+ARG LOG_DIR=${HOME_DIR}/logs
+ARG TMP_DIR=${HOME_DIR}/tmp
+RUN mkdir -p "${HOME_DIR}" "${LOG_DIR}" "${TMP_DIR}"
+
+WORKDIR ${HOME_DIR}
+
+# Python dependencies. Only requirements.txt is copied first so these layers
+# are rebuilt when the pins change, not on every source edit.
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Replace the jaxlib pinned in requirements.txt with the +cuda111 build so jax
+# can detect GPUs.
+RUN pip install --no-cache-dir -U jaxlib==0.1.60+cuda111 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
+# Expose libcusolver.so.11 under the .so.10 name and put the CUDA libraries on
+# the loader path.
+# NOTE(review): presumably needed because the pinned TensorFlow/jaxlib wheels
+# look for libcusolver.so.10 -- confirm.
+RUN ln -s /usr/local/cuda/lib64/libcusolver.so.11 /usr/local/cuda/lib64/libcusolver.so.10
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64"
+
+# Project sources go in last: they change most often.
+COPY . .
+
+# TODO: add a default command in exec form once the entry point is settled,
+# e.g. CMD ["python", "main.py"]
diff --git a/configs/default_cifar10_configs.py b/configs/default_cifar10_configs.py
index 664054f..7fa2246 100644
--- a/configs/default_cifar10_configs.py
+++ b/configs/default_cifar10_configs.py
@@ -5,7 +5,8 @@ def get_default_configs():
   config = ml_collections.ConfigDict()
   # training
   config.training = training = ml_collections.ConfigDict()
-  config.training.batch_size = 128
+  # config.training.batch_size = 128
+  config.training.batch_size = 2
   training.n_iters = 1300001
   training.snapshot_freq = 50000
   training.log_freq = 50
diff --git a/docker.md b/docker.md
new file mode 100644
index 0000000..2e2ece7
--- /dev/null
+++ b/docker.md
@@ -0,0 +1,52 @@
+# Mr Yang's `score_sde`, dockerized
+
+## OS Prereqs
+
+### Install GPU drivers
+
+1. Ensure packages are up to date & install
+
+    `sudo apt update && sudo apt -y upgrade && sudo apt-get -y install ubuntu-drivers-common && sudo ubuntu-drivers autoinstall && sudo apt install nvidia-driver-515`
+
+    Depending on the card, you'll want to install the most up-to-date driver
+
+    - `t4`: 515
+    - `v100`: 520
+
+    To check:
+
+    - `ubuntu-drivers devices`
+    - `nvidia-smi`
+
+2. Reboot and wait for instance to come back up (~ minute)
+
+    `sudo reboot`
+
+3. Verify installation
+
+    `nvidia-smi`
+
+
+### Set up docker environment
+
+1. Install docker
+
+    - ubuntu 22.04:
+        `curl https://get.docker.com | sh \
+        && sudo systemctl --now enable docker`
+
+    - ubuntu 20.04: follow guide below
+        `https://docs.docker.com/engine/install/ubuntu/`
+
+2. Install the Nvidia Container Toolkit and restart docker
+
+    `distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
+    && curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - \
+    && curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list && sudo apt-get update && sudo apt-get install -y nvidia-docker2`
+
+    `sudo systemctl restart docker && sudo chmod 666 /var/run/docker.sock`
+
+
+### Build container
+
+TODO
\ No newline at end of file
diff --git a/main.py b/main.py
index 489d366..b64b924 100644
--- a/main.py
+++ b/main.py
@@ -38,6 +38,20 @@ def main(argv):
   tf.config.experimental.set_visible_devices([], "GPU")
   os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'
 
+  print("@@@@@@@@@")
+  gpus = tf.config.list_physical_devices('GPU')
+  if gpus:
+    try:
+      # Currently, memory growth needs to be the same across GPUs
+      for gpu in gpus:
+        tf.config.experimental.set_memory_growth(gpu, True)
+      logical_gpus = tf.config.list_logical_devices('GPU')
+      print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
+    except RuntimeError as e:
+      # Memory growth must be set before GPUs have been initialized
+      print(e)
+  print("@@@@@@@@@")
+
   if FLAGS.mode == "train":
     # Create the working directory
     tf.io.gfile.makedirs(FLAGS.workdir)
diff --git a/requirements.txt b/requirements.txt
index 65beedc..49b7bb8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 ml-collections==0.1.0
 tensorflow-gan==2.0.0
-tensorflow_io
+tensorflow_io==0.17.1
 tensorflow_datasets==3.1.0
 tensorflow==2.4.0
 tensorflow-addons==0.12.0
@@ -8,4 +8,10 @@ tensorboard==2.4.0
 absl-py==0.10.0
 flax==0.3.1
 jax==0.2.8
-jaxlib==0.1.59
+jaxlib==0.1.60
+tensorflow-probability==0.12.2
+numpy==1.19.5
+six==1.15.0
+termcolor==1.1.0
+wrapt==1.12.1
+protobuf==3.17.3
\ No newline at end of file