diff --git a/.azure/gpu-benchmarks.yml b/.azure/gpu-benchmarks.yml index d01594ea83bdb..045c0cd45ccb9 100644 --- a/.azure/gpu-benchmarks.yml +++ b/.azure/gpu-benchmarks.yml @@ -96,7 +96,7 @@ jobs: - bash: python -m pytest parity_$(PACKAGE_NAME) -v --durations=0 env: PL_RUNNING_BENCHMARKS: "1" - PL_RUN_CUDA_TESTS: "1" + RUN_ONLY_CUDA_TESTS: "1" workingDirectory: tests/ displayName: "Testing: benchmarks" @@ -105,7 +105,7 @@ jobs: # without succeeded this could run even if the job has already failed condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric')) env: - PL_RUN_CUDA_TESTS: "1" + RUN_ONLY_CUDA_TESTS: "1" PL_RUN_STANDALONE_TESTS: "1" displayName: "Testing: fabric standalone tasks" timeoutInMinutes: "10" diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 1adf9bfff67f4..3c01e47a09f99 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -48,7 +48,7 @@ jobs: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) FREEZE_REQUIREMENTS: "1" PIP_CACHE_DIR: "/var/tmp/pip" - PL_RUN_CUDA_TESTS: "1" + RUN_ONLY_CUDA_TESTS: "1" container: image: $(image) # default shm size is 64m. Increase it to avoid: @@ -78,8 +78,6 @@ jobs: echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html" scope=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))') echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope" - python_ver=$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')") - echo "##vso[task.setvariable variable=PYTHON_VERSION_MM]$python_ver" displayName: "set env. 
vars" - bash: | echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index dc1b4daf03075..820831aae83f9 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -66,7 +66,7 @@ jobs: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) FREEZE_REQUIREMENTS: "1" PIP_CACHE_DIR: "/var/tmp/pip" - PL_RUN_CUDA_TESTS: "1" + RUN_ONLY_CUDA_TESTS: "1" container: image: $(image) # default shm size is 64m. Increase it to avoid: @@ -82,8 +82,6 @@ jobs: echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html" scope=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(pytorch="pytorch_lightning").get(n, n))') echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope" - python_ver=$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')") - echo "##vso[task.setvariable variable=PYTHON_VERSION_MM]$python_ver" displayName: "set env. 
vars" - bash: | echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}" diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml new file mode 100644 index 0000000000000..edaf0837fe79e --- /dev/null +++ b/.lightning/workflows/fabric.yml @@ -0,0 +1,110 @@ +trigger: + push: + branches: ["master"] + pull_request: + branches: ["master"] + +timeout: "75" # minutes +machine: "L4_X_2" +parametrize: + matrix: {} + include: + # note that this is setting also all oldest requirements which is linked to Torch == 2.0 + - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" + PACKAGE_NAME: "fabric" + - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + PACKAGE_NAME: "fabric" + # - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + # PACKAGE_NAME: "fabric" + - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + PACKAGE_NAME: "lightning" + exclude: [] + +env: + FREEZE_REQUIREMENTS: "1" + RUN_ONLY_CUDA_TESTS: "1" + +run: | + whereis nvidia + nvidia-smi + python --version + pip --version + pip install -q fire wget packaging + set -ex + + CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda" + echo "Using CUDA version: ${CUDA_VERSION}" + CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # Get major.minor by removing the last dot and everything after + CUDA_VERSION_MM="${CUDA_VERSION_M_M//'.'/''}" + TORCH_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html" + echo "Torch URL: ${TORCH_URL}" + COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))') + echo "collecting coverage for: ${COVERAGE_SOURCE}" + + if [ "${TORCH_VER}" == "2.1" ]; then + echo "Set oldest versions" + python .actions/assistant.py replace_oldest_ver + pip install "cython<3.0" wheel # for compatibility + fi + + echo "Adjust torch versions in 
requirements files" + PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + pip install -q wget packaging + python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py + for fpath in `ls requirements/**/*.txt`; do \ + python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \ + done + + if [ "${PACKAGE_NAME}" == "fabric" ]; then + echo "Replaced PL imports" + pip install -U -q -r .actions/requirements.txt + python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_fabric" \ + --source_import="lightning.fabric" \ + --target_import="lightning_fabric" + python .actions/assistant.py copy_replace_imports --source_dir="./examples/fabric" \ + --source_import="lightning.fabric" \ + --target_import="lightning_fabric" + fi + + extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))") + pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}" + + python requirements/collect_env_details.py + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" + python requirements/pytorch/check-avail-extras.py + python -c "import bitsandbytes" + + echo "Testing: Fabric doctests" + if [ "${PACKAGE_NAME}" == "fabric" ]; then + cd src/ + python -m pytest lightning_fabric + cd .. 
+ fi + + cd tests/ + echo "Testing: fabric standard" + python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50 + + echo "Testing: fabric standalone" + export PL_RUN_STANDALONE_TESTS=1 + wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh + bash ./run_standalone_tests.sh "tests_fabric" + + # echo "Reporting coverage" # todo + # python -m coverage report + # python -m coverage xml + # python -m coverage html + + # TODO: enable coverage + # # https://docs.codecov.com/docs/codecov-uploader + # curl -Os https://uploader.codecov.io/latest/linux/codecov + # chmod +x codecov + # ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ + # --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure + # ls -l + cd .. + + echo "Testing: fabric examples" + cd examples/ + bash run_fabric_examples.sh --accelerator=cuda --devices=1 + bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml new file mode 100644 index 0000000000000..81063c3699769 --- /dev/null +++ b/.lightning/workflows/pytorch.yml @@ -0,0 +1,131 @@ +trigger: + push: + branches: ["master"] + pull_request: + branches: ["master"] + +timeout: "75" # minutes +machine: "L4_X_2" +parametrize: + matrix: {} + include: + # note that this is setting also all oldest requirements which is linked to Torch == 2.0 + - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1" + PACKAGE_NAME: "pytorch" + - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + PACKAGE_NAME: "pytorch" + # - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + # PACKAGE_NAME: "pytorch" + - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3" + PACKAGE_NAME: "lightning" + exclude: [] + +env: + FREEZE_REQUIREMENTS: "1" + 
RUN_ONLY_CUDA_TESTS: "1" + +run: | + whereis nvidia + nvidia-smi + python --version + pip --version + pip install -q fire wget packaging + set -ex + + CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda" + echo "Using CUDA version: ${CUDA_VERSION}" + CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # Get major.minor by removing the last dot and everything after + CUDA_VERSION_MM="${CUDA_VERSION_M_M//'.'/''}" + TORCH_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html" + echo "Torch URL: ${TORCH_URL}" + COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(pytorch="pytorch_lightning").get(n, n))') + echo "collecting coverage for: ${COVERAGE_SOURCE}" + + if [ "${TORCH_VER}" == "2.1" ]; then + echo "Set oldest versions" + python .actions/assistant.py replace_oldest_ver + pip install "cython<3.0" wheel # for compatibility + fi + + echo "Adjust torch versions in requirements files" + PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + pip install -q wget packaging + python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py + for fpath in `ls requirements/**/*.txt`; do \ + python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \ + done + + if [ "${PACKAGE_NAME}" == "pytorch" ]; then + echo "Adjust PL imports" + pip install -U -q -r .actions/requirements.txt + python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_pytorch" \ + --source_import="lightning.fabric,lightning.pytorch" \ + --target_import="lightning_fabric,pytorch_lightning" + python .actions/assistant.py copy_replace_imports --source_dir="./examples/pytorch/basics" \ + --source_import="lightning.fabric,lightning.pytorch" \ + --target_import="lightning_fabric,pytorch_lightning" + fi + + extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))") + pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager 
--extra-index-url="${TORCH_URL}" + + if [ "${PACKAGE_NAME}" == "pytorch" ]; then + echo "uninstall lightning to have just single package" + pip uninstall -y lightning + elif [ "${PACKAGE_NAME}" == "lightning" ]; then + echo "uninstall PL to have just single package" + pip uninstall -y pytorch-lightning + fi + + python requirements/collect_env_details.py + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" + python requirements/pytorch/check-avail-extras.py + python -c "import bitsandbytes" + + echo "Testing: Pytorch doctests" + if [ "${PACKAGE_NAME}" == "pytorch" ]; then + cd src/ + python -m pytest pytorch_lightning + cd .. + fi + + echo "Get legacy checkpoints" + bash .actions/pull_legacy_checkpoints.sh + cd tests/legacy + # bash generate_checkpoints.sh + ls -lh checkpoints/ + cd ../.. + + cd tests/ + echo "Testing: fabric standard" + python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_pytorch/ -v --durations=50 + + echo "Testing: fabric standalone" + export PL_USE_MOCKED_MNIST=1 + export PL_RUN_STANDALONE_TESTS=1 + wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh + bash ./run_standalone_tests.sh "tests_pytorch" + + echo "Testing: PyTorch standalone tasks" + cd tests_pytorch/ + bash run_standalone_tasks.sh + + # echo "Reporting coverage" # todo + # python -m coverage report + # python -m coverage xml + # python -m coverage html + + # TODO: enable coverage + # # https://docs.codecov.com/docs/codecov-uploader + # curl -Os https://uploader.codecov.io/latest/linux/codecov + # chmod +x codecov + # ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ + # --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure + # ls -l + cd ../.. 
+ + echo "Testing: PyTorch examples" + cd examples/ + bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=1 + bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=2 --trainer.strategy=ddp + bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=2 --trainer.strategy=ddp --trainer.precision=16 diff --git a/src/lightning/fabric/utilities/testing/_runif.py b/src/lightning/fabric/utilities/testing/_runif.py index 6f5d933f9dae3..88bdfd399a4a0 100644 --- a/src/lightning/fabric/utilities/testing/_runif.py +++ b/src/lightning/fabric/utilities/testing/_runif.py @@ -44,7 +44,7 @@ def _runif_reasons( """Construct reasons for pytest skipif. Args: - min_cuda_gpus: Require this number of gpus and that the ``PL_RUN_CUDA_TESTS=1`` environment variable is set. + min_cuda_gpus: Require this number of gpus and that the ``RUN_ONLY_CUDA_TESTS=1`` environment variable is set. min_torch: Require that PyTorch is greater or equal than this version. max_torch: Require that PyTorch is less than this version. min_python: Require that Python is greater or equal than this version. diff --git a/src/lightning/pytorch/utilities/testing/_runif.py b/src/lightning/pytorch/utilities/testing/_runif.py index 9c46913681143..3fac3b38f2b43 100644 --- a/src/lightning/pytorch/utilities/testing/_runif.py +++ b/src/lightning/pytorch/utilities/testing/_runif.py @@ -46,7 +46,7 @@ def _runif_reasons( """Construct reasons for pytest skipif. Args: - min_cuda_gpus: Require this number of gpus and that the ``PL_RUN_CUDA_TESTS=1`` environment variable is set. + min_cuda_gpus: Require this number of gpus and that the ``RUN_ONLY_CUDA_TESTS=1`` environment variable is set. min_torch: Require that PyTorch is greater or equal than this version. max_torch: Require that PyTorch is less than this version. min_python: Require that Python is greater or equal than this version. 
diff --git a/tests/tests_fabric/conftest.py b/tests/tests_fabric/conftest.py index 68f3f2cc38191..9d4a0b9462f2e 100644 --- a/tests/tests_fabric/conftest.py +++ b/tests/tests_fabric/conftest.py @@ -212,7 +212,7 @@ def pytest_collection_modifyitems(items: list[pytest.Function], config: pytest.C options = { "standalone": "PL_RUN_STANDALONE_TESTS", - "min_cuda_gpus": "PL_RUN_CUDA_TESTS", + "min_cuda_gpus": "RUN_ONLY_CUDA_TESTS", "tpu": "PL_RUN_TPU_TESTS", } if os.getenv(options["standalone"], "0") == "1" and os.getenv(options["min_cuda_gpus"], "0") == "1": diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index b02d9d089a354..f52d307b3d845 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -333,7 +333,7 @@ def pytest_collection_modifyitems(items: list[pytest.Function], config: pytest.C options = { "standalone": "PL_RUN_STANDALONE_TESTS", - "min_cuda_gpus": "PL_RUN_CUDA_TESTS", + "min_cuda_gpus": "RUN_ONLY_CUDA_TESTS", "tpu": "PL_RUN_TPU_TESTS", } if os.getenv(options["standalone"], "0") == "1" and os.getenv(options["min_cuda_gpus"], "0") == "1":