```diff
--- a/.github/workflows/flash_attention.yml
+++ b/.github/workflows/flash_attention.yml
@@ -11,34 +11,68 @@
       - .github/workflows/flash_attention.yml
   repository_dispatch:
     types: benchmark_flash_attention
-  workflow_dispatch:
+  workflow_dispatch:
+
 jobs:
   benchmark-flash-attn:
     name: Flash Attention CuTe DSL Benchmark
     runs-on: linux.dgx.b200.8
-    container:
-      # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/
-      image: nvcr.io/nvidia/pytorch:25.06-py3
-      options: --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Checkout Flash Attention repository
+        uses: actions/checkout@v4
         with:
-          repository: 'Dao-AILab/flash-attention'
-          path: 'fa4'
-      - name: Install CuTe DSL
+          repository: Dao-AILab/flash-attention
+          path: fa4
+          submodules: recursive
+
+      - name: Setup GPU flags for docker run
         run: |
-          set -x
-          echo "Installing nvidia-cutlass-dsl"
-          pip install nvidia-cutlass-dsl==4.1.0
-      - name: Buid and Run FlashAttention CuTe DSL
+          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+
+      - name: Run Flash Attention benchmark in Docker
+        env:
+          DOCKER_IMAGE: nvcr.io/nvidia/pytorch:25.06-py3
         run: |
-          set -x
-          pushd fa4
-          python setup.py install
+          set -eux
+
+          container_name=$(docker run \
+            ${GPU_FLAG} \
+            --ipc=host \
+            --ulimit memlock=-1 \
+            --ulimit stack=67108864 \
+            --tty \
+            --detach \
+            --security-opt seccomp=unconfined \
+            --shm-size=4g \
+            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
+            -w /tmp/workspace \
+            "${DOCKER_IMAGE}"
+          )
+
+          # Install CuTe DSL
+          docker exec -t "${container_name}" bash -c "
+            set -x
+            echo 'Installing nvidia-cutlass-dsl'
+            pip install nvidia-cutlass-dsl==4.1.0
+          "
+
+          # Build and run FlashAttention CuTe DSL
+          docker exec -t "${container_name}" bash -c "
+            set -x
+            pushd fa4
+            python setup.py install
 
-          echo '<h1>B200 1000W</h1>' >> $GITHUB_STEP_SUMMARY
-          nvidia-smi
-          export PYTHONPATH=$(pwd)
-          python benchmarks/benchmark_attn.py >> $GITHUB_STEP_SUMMARY
+            echo '<h1>B200 1000W</h1>' >> /tmp/workspace/fa4_output.txt
+            nvidia-smi
+            export PYTHONPATH=\$(pwd)
+            python benchmarks/benchmark_attn.py >> /tmp/workspace/fa4_output.txt
+            popd
+          "
 
-          popd
+          # Display results in GitHub step summary
+          if [ -f fa4_output.txt ]; then
+            cat fa4_output.txt >> $GITHUB_STEP_SUMMARY
+          fi
```
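The core of the change is the move away from the job-level `container:` block: the job now starts the NGC PyTorch image itself with `docker run --detach` and drives it through `docker exec`, so the CuTe DSL install and the build-and-benchmark run happen as separate commands inside one long-lived container. Below is a minimal local sketch of that detach-then-exec pattern, not the PR's exact commands; it assumes Docker plus the NVIDIA Container Toolkit are installed, and reuses the image tag pinned in the workflow:

```bash
#!/usr/bin/env bash
# Local sketch of the workflow's detach-then-exec pattern (assumptions:
# Docker and the NVIDIA Container Toolkit are installed on the host).
set -eux

IMAGE=nvcr.io/nvidia/pytorch:25.06-py3

# Start a long-lived container, mounting the current checkout.
cid=$(docker run --gpus all --ipc=host --tty --detach \
  -v "$(pwd):/tmp/workspace" -w /tmp/workspace "${IMAGE}")

# Each docker exec is the local analogue of one command in the workflow step.
docker exec -t "${cid}" pip install nvidia-cutlass-dsl==4.1.0
docker exec -t "${cid}" nvidia-smi

# Unlike a `container:` job, a detached container must be torn down explicitly.
docker rm -f "${cid}"
```

One side effect worth noting: with `container:`, the runner cleaned up the container automatically, while the workflow as written leaves the detached container running until the runner itself is recycled.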
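Since the workflow keeps its `repository_dispatch` trigger with type `benchmark_flash_attention`, the benchmark can also be kicked off through the GitHub REST API. A sketch follows; `OWNER/REPO` is a placeholder for whichever repository hosts this workflow, and `GITHUB_TOKEN` must be a token authorized to create dispatch events there:

```bash
# Hypothetical trigger via repository_dispatch (OWNER/REPO is a placeholder;
# the event_type matches the one declared in the workflow's `on:` block).
curl -X POST \
  -H "Accept: application/vnd.github+json" \
  -H "Authorization: Bearer ${GITHUB_TOKEN}" \
  https://api.github.com/repos/OWNER/REPO/dispatches \
  -d '{"event_type": "benchmark_flash_attention"}'
```

The newly added `workflow_dispatch:` trigger additionally allows a manual run from the Actions tab, or `gh workflow run flash_attention.yml` with the GitHub CLI.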