Skip to content

add/debug Lit CI #21002

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .azure/gpu-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ jobs:
- bash: python -m pytest parity_$(PACKAGE_NAME) -v --durations=0
env:
PL_RUNNING_BENCHMARKS: "1"
PL_RUN_CUDA_TESTS: "1"
RUN_ONLY_CUDA_TESTS: "1"
workingDirectory: tests/
displayName: "Testing: benchmarks"

Expand All @@ -105,7 +105,7 @@ jobs:
# without succeeded this could run even if the job has already failed
condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
env:
PL_RUN_CUDA_TESTS: "1"
RUN_ONLY_CUDA_TESTS: "1"
PL_RUN_STANDALONE_TESTS: "1"
displayName: "Testing: fabric standalone tasks"
timeoutInMinutes: "10"
4 changes: 1 addition & 3 deletions .azure/gpu-tests-fabric.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
FREEZE_REQUIREMENTS: "1"
PIP_CACHE_DIR: "/var/tmp/pip"
PL_RUN_CUDA_TESTS: "1"
RUN_ONLY_CUDA_TESTS: "1"
container:
image: $(image)
# default shm size is 64m. Increase it to avoid:
Expand Down Expand Up @@ -78,8 +78,6 @@ jobs:
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
scope=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))')
echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope"
python_ver=$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")
echo "##vso[task.setvariable variable=PYTHON_VERSION_MM]$python_ver"
displayName: "set env. vars"
- bash: |
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}"
Expand Down
4 changes: 1 addition & 3 deletions .azure/gpu-tests-pytorch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ jobs:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
FREEZE_REQUIREMENTS: "1"
PIP_CACHE_DIR: "/var/tmp/pip"
PL_RUN_CUDA_TESTS: "1"
RUN_ONLY_CUDA_TESTS: "1"
container:
image: $(image)
# default shm size is 64m. Increase it to avoid:
Expand All @@ -82,8 +82,6 @@ jobs:
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
scope=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(pytorch="pytorch_lightning").get(n, n))')
echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope"
python_ver=$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")
echo "##vso[task.setvariable variable=PYTHON_VERSION_MM]$python_ver"
displayName: "set env. vars"
- bash: |
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}"
Expand Down
110 changes: 110 additions & 0 deletions .lightning/workflows/fabric.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Lightning CI workflow: GPU tests for lightning_fabric / lightning[fabric].
# NOTE(review): this script assumes the workflow runner exports the matrix
# values (`image`, `PACKAGE_NAME`) as environment variables — `${image}` and
# `${PACKAGE_NAME}` are read that way below. TODO confirm against runner docs.
trigger:
  push:
    branches: ["master"]
  pull_request:
    branches: ["master"]

timeout: "75" # minutes
machine: "L4_X_2"
parametrize:
  matrix: {}
  include:
    # note that this is setting also all oldest requirements which is linked to Torch == 2.0
    - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1"
      PACKAGE_NAME: "fabric"
    - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3"
      PACKAGE_NAME: "fabric"
    # - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3"
    #   PACKAGE_NAME: "fabric"
    - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3"
      PACKAGE_NAME: "lightning"
  exclude: []

env:
  FREEZE_REQUIREMENTS: "1"
  RUN_ONLY_CUDA_TESTS: "1"

run: |
  whereis nvidia
  nvidia-smi
  python --version
  pip --version
  pip install -q fire wget packaging
  set -ex

  # Derive CUDA/Torch versions from the image tag, e.g.
  # ".../pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1"
  CUDA_VERSION="${image##*cuda}" # remove everything up to and including the last "cuda"
  echo "Using CUDA version: ${CUDA_VERSION}"
  # FIX: was "${cuda_version%.*}" — an undefined lowercase variable, so the
  # major.minor string was always empty and TORCH_URL was malformed.
  CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # keep major.minor, drop the patch part
  CUDA_VERSION_MM="${CUDA_VERSION_M_M//'.'/''}"
  TORCH_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html"
  echo "Torch URL: ${TORCH_URL}"
  # FIX: TORCH_VER was referenced below but never defined; derive it from the
  # image tag ("...-torch2.1-cuda..." -> "2.1").
  TORCH_VER="${image##*torch}"
  TORCH_VER="${TORCH_VER%%-*}"
  echo "Using Torch version: ${TORCH_VER}"
  # FIX: was single-quoted with the literal text $(PACKAGE_NAME), which the
  # shell never expands — use double quotes so ${PACKAGE_NAME} is substituted.
  COVERAGE_SOURCE=$(python -c "n = '${PACKAGE_NAME}' ; print(dict(fabric='lightning_fabric').get(n, n))")
  echo "collecting coverage for: ${COVERAGE_SOURCE}"

  if [ "${TORCH_VER}" == "2.1" ]; then
    echo "Set oldest versions"
    python .actions/assistant.py replace_oldest_ver
    pip install "cython<3.0" wheel # for compatibility
  fi

  echo "Adjust torch versions in requirements files"
  PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
  # (wget/packaging already installed above — duplicate install removed)
  python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
  for fpath in `ls requirements/**/*.txt`; do
    python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}
  done

  if [ "${PACKAGE_NAME}" == "fabric" ]; then
    echo "Replace PL imports"
    pip install -U -q -r .actions/requirements.txt
    python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_fabric" \
      --source_import="lightning.fabric" \
      --target_import="lightning_fabric"
    python .actions/assistant.py copy_replace_imports --source_dir="./examples/fabric" \
      --source_import="lightning.fabric" \
      --target_import="lightning_fabric"
  fi

  # FIX: was '$(PACKAGE_NAME)' inside double quotes — bash treated it as a
  # command substitution (running a nonexistent command) instead of the value.
  extra=$(python -c "print({'lightning': 'fabric-'}.get('${PACKAGE_NAME}', ''))")
  pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}"

  python requirements/collect_env_details.py
  python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
  python requirements/pytorch/check-avail-extras.py
  python -c "import bitsandbytes"

  echo "Testing: Fabric doctests"
  if [ "${PACKAGE_NAME}" == "fabric" ]; then
    cd src/
    python -m pytest lightning_fabric
    cd ..
  fi

  cd tests/
  echo "Testing: fabric standard"
  python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50

  echo "Testing: fabric standalone"
  export PL_RUN_STANDALONE_TESTS=1
  wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
  bash ./run_standalone_tests.sh "tests_fabric"

  # echo "Reporting coverage" # todo
  # python -m coverage report
  # python -m coverage xml
  # python -m coverage html

  # TODO: enable coverage
  # # https://docs.codecov.com/docs/codecov-uploader
  # curl -Os https://uploader.codecov.io/latest/linux/codecov
  # chmod +x codecov
  # ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
  #   --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
  # ls -l
  cd ..

  echo "Testing: fabric examples"
  cd examples/
  bash run_fabric_examples.sh --accelerator=cuda --devices=1
  bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp
131 changes: 131 additions & 0 deletions .lightning/workflows/pytorch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Lightning CI workflow: GPU tests for pytorch_lightning / lightning[pytorch].
# NOTE(review): this script assumes the workflow runner exports the matrix
# values (`image`, `PACKAGE_NAME`) as environment variables — `${image}` and
# `${PACKAGE_NAME}` are read that way below. TODO confirm against runner docs.
trigger:
  push:
    branches: ["master"]
  pull_request:
    branches: ["master"]

timeout: "75" # minutes
machine: "L4_X_2"
parametrize:
  matrix: {}
  include:
    # note that this is setting also all oldest requirements which is linked to Torch == 2.0
    - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1"
      PACKAGE_NAME: "pytorch"
    - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3"
      PACKAGE_NAME: "pytorch"
    # - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3"
    #   PACKAGE_NAME: "pytorch"
    - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.7-cuda12.6.3"
      PACKAGE_NAME: "lightning"
  exclude: []

env:
  FREEZE_REQUIREMENTS: "1"
  RUN_ONLY_CUDA_TESTS: "1"

run: |
  whereis nvidia
  nvidia-smi
  python --version
  pip --version
  pip install -q fire wget packaging
  set -ex

  # Derive CUDA/Torch versions from the image tag, e.g.
  # ".../pytorch_lightning:base-cuda-py3.10-torch2.1-cuda12.1.1"
  CUDA_VERSION="${image##*cuda}" # remove everything up to and including the last "cuda"
  echo "Using CUDA version: ${CUDA_VERSION}"
  # FIX: was "${cuda_version%.*}" — an undefined lowercase variable, so the
  # major.minor string was always empty and TORCH_URL was malformed.
  CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # keep major.minor, drop the patch part
  CUDA_VERSION_MM="${CUDA_VERSION_M_M//'.'/''}"
  TORCH_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html"
  echo "Torch URL: ${TORCH_URL}"
  # FIX: TORCH_VER was referenced below but never defined; derive it from the
  # image tag ("...-torch2.1-cuda..." -> "2.1").
  TORCH_VER="${image##*torch}"
  TORCH_VER="${TORCH_VER%%-*}"
  echo "Using Torch version: ${TORCH_VER}"
  # FIX 1: this workflow's PACKAGE_NAME is "pytorch", so the remap key must be
  # pytorch (was fabric, copied from the fabric workflow) — coverage source was
  # never remapped to pytorch_lightning.
  # FIX 2: was single-quoted with the literal text $(PACKAGE_NAME), which the
  # shell never expands — use double quotes so ${PACKAGE_NAME} is substituted.
  COVERAGE_SOURCE=$(python -c "n = '${PACKAGE_NAME}' ; print(dict(pytorch='pytorch_lightning').get(n, n))")
  echo "collecting coverage for: ${COVERAGE_SOURCE}"

  if [ "${TORCH_VER}" == "2.1" ]; then
    # FIX: was "recho" (typo) — with `set -e` the unknown command aborted the job.
    echo "Set oldest versions"
    python .actions/assistant.py replace_oldest_ver
    pip install "cython<3.0" wheel # for compatibility
  fi

  echo "Adjust torch versions in requirements files"
  PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
  # (wget/packaging already installed above — duplicate install removed)
  python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
  for fpath in `ls requirements/**/*.txt`; do
    python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}
  done

  if [ "${PACKAGE_NAME}" == "pytorch" ]; then
    echo "Adjust PL imports"
    pip install -U -q -r .actions/requirements.txt
    python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_pytorch" \
      --source_import="lightning.fabric,lightning.pytorch" \
      --target_import="lightning_fabric,pytorch_lightning"
    python .actions/assistant.py copy_replace_imports --source_dir="./examples/pytorch/basics" \
      --source_import="lightning.fabric,lightning.pytorch" \
      --target_import="lightning_fabric,pytorch_lightning"
  fi

  # FIX: was '$(PACKAGE_NAME)' inside double quotes — bash treated it as a
  # command substitution (running a nonexistent command) instead of the value.
  extra=$(python -c "print({'lightning': 'pytorch-'}.get('${PACKAGE_NAME}', ''))")
  pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}"

  if [ "${PACKAGE_NAME}" == "pytorch" ]; then
    echo "uninstall lightning to have just single package"
    pip uninstall -y lightning
  elif [ "${PACKAGE_NAME}" == "lightning" ]; then
    echo "uninstall PL to have just single package"
    pip uninstall -y pytorch-lightning
  fi

  python requirements/collect_env_details.py
  python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
  python requirements/pytorch/check-avail-extras.py
  python -c "import bitsandbytes"

  echo "Testing: Pytorch doctests"
  if [ "${PACKAGE_NAME}" == "pytorch" ]; then
    cd src/
    python -m pytest pytorch_lightning
    cd ..
  fi

  echo "Get legacy checkpoints"
  bash .actions/pull_legacy_checkpoints.sh
  cd tests/legacy
  # bash generate_checkpoints.sh
  ls -lh checkpoints/
  cd ../..

  cd tests/
  # FIX: labels said "fabric" (copied from the fabric workflow).
  echo "Testing: PyTorch standard"
  python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_pytorch/ -v --durations=50

  echo "Testing: PyTorch standalone"
  export PL_USE_MOCKED_MNIST=1
  export PL_RUN_STANDALONE_TESTS=1
  wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
  bash ./run_standalone_tests.sh "tests_pytorch"

  echo "Testing: PyTorch standalone tasks"
  cd tests_pytorch/
  bash run_standalone_tasks.sh

  # echo "Reporting coverage" # todo
  # python -m coverage report
  # python -m coverage xml
  # python -m coverage html

  # TODO: enable coverage
  # # https://docs.codecov.com/docs/codecov-uploader
  # curl -Os https://uploader.codecov.io/latest/linux/codecov
  # chmod +x codecov
  # ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
  #   --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
  # ls -l
  cd ../..

  echo "Testing: PyTorch examples"
  cd examples/
  bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=1
  bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=2 --trainer.strategy=ddp
  bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=2 --trainer.strategy=ddp --trainer.precision=16
2 changes: 1 addition & 1 deletion src/lightning/fabric/utilities/testing/_runif.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _runif_reasons(
"""Construct reasons for pytest skipif.

Args:
min_cuda_gpus: Require this number of gpus and that the ``PL_RUN_CUDA_TESTS=1`` environment variable is set.
min_cuda_gpus: Require this number of gpus and that the ``RUN_ONLY_CUDA_TESTS=1`` environment variable is set.
min_torch: Require that PyTorch is greater or equal than this version.
max_torch: Require that PyTorch is less than this version.
min_python: Require that Python is greater or equal than this version.
Expand Down
2 changes: 1 addition & 1 deletion src/lightning/pytorch/utilities/testing/_runif.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def _runif_reasons(
"""Construct reasons for pytest skipif.

Args:
min_cuda_gpus: Require this number of gpus and that the ``PL_RUN_CUDA_TESTS=1`` environment variable is set.
min_cuda_gpus: Require this number of gpus and that the ``RUN_ONLY_CUDA_TESTS=1`` environment variable is set.
min_torch: Require that PyTorch is greater or equal than this version.
max_torch: Require that PyTorch is less than this version.
min_python: Require that Python is greater or equal than this version.
Expand Down
2 changes: 1 addition & 1 deletion tests/tests_fabric/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def pytest_collection_modifyitems(items: list[pytest.Function], config: pytest.C

options = {
"standalone": "PL_RUN_STANDALONE_TESTS",
"min_cuda_gpus": "PL_RUN_CUDA_TESTS",
"min_cuda_gpus": "RUN_ONLY_CUDA_TESTS",
"tpu": "PL_RUN_TPU_TESTS",
}
if os.getenv(options["standalone"], "0") == "1" and os.getenv(options["min_cuda_gpus"], "0") == "1":
Expand Down
2 changes: 1 addition & 1 deletion tests/tests_pytorch/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ def pytest_collection_modifyitems(items: list[pytest.Function], config: pytest.C

options = {
"standalone": "PL_RUN_STANDALONE_TESTS",
"min_cuda_gpus": "PL_RUN_CUDA_TESTS",
"min_cuda_gpus": "RUN_ONLY_CUDA_TESTS",
"tpu": "PL_RUN_TPU_TESTS",
}
if os.getenv(options["standalone"], "0") == "1" and os.getenv(options["min_cuda_gpus"], "0") == "1":
Expand Down
Loading