Commit 56fd993

extras: checkpoint patch tooling updates
1 parent 15dcd15 commit 56fd993

File tree

5 files changed: +299 -2 lines changed

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
   set(FlashMLA_SOURCES
     ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
     ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
     ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
     ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
     ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
@@
   define_gpu_extension_target(
     _flashmla_C
     DESTINATION vllm
     LANGUAGE ${VLLM_GPU_LANG}
     SOURCES ${FlashMLA_SOURCES}
     COMPILE_FLAGS ${VLLM_GPU_FLAGS}
     ARCHITECTURES ${VLLM_GPU_ARCHES}
     INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
     USE_SABI 3
     WITH_SOABI)
+  if(CUDAToolkit_INCLUDE_DIRS)
+    target_include_directories(_flashmla_C PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+  endif()
+  # _flashmla_C_LIBCUDACXX_INJECT
+  if(CUDAToolkit_ROOT)
+    foreach(_cxx_inc
+      "${CUDAToolkit_ROOT}/targets/x86_64-linux/include"
+      "${CUDAToolkit_ROOT}/targets/x86_64-linux/include/cccl"
+      "${CUDAToolkit_ROOT}/include/cccl"
+    )
+      if(EXISTS "${_cxx_inc}")
+        target_include_directories(_flashmla_C PRIVATE "${_cxx_inc}")
+      endif()
+    endforeach()
+  endif()
   if(CUDAToolkit_INCLUDE_DIRS)
     target_include_directories(_flashmla_C PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
   endif()
 else()
   # Create an empty target for setup.py when not targeting sm90a systems
   add_custom_target(_flashmla_C)
 endif()
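
The foreach block above only registers include roots that actually exist on disk. A minimal Python sketch of the same existence probe (the CUDA_HOME or /usr/local/cuda default below is an assumption; the real build uses the CUDAToolkit_ROOT that CMake resolves):

    # check_cccl_includes.py -- hypothetical helper, not part of the commit
    import os

    root = os.environ.get("CUDA_HOME", "/usr/local/cuda")
    for inc in (
        os.path.join(root, "targets", "x86_64-linux", "include"),
        os.path.join(root, "targets", "x86_64-linux", "include", "cccl"),
        os.path.join(root, "include", "cccl"),
    ):
        # The cmake foreach adds each of these as a PRIVATE include dir iff it exists
        print(inc, "->", "exists" if os.path.isdir(inc) else "missing")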

extras/podman/run.ps1

Lines changed: 2 additions & 2 deletions
@@ -166,7 +166,7 @@ nvidia-smi || true
 if ($Progress) { $envs += @('PROGRESS_WATCH=1') }
 $envs += @('NVIDIA_VISIBLE_DEVICES=all')
 $envStr = ($envs | ForEach-Object { "export $_;" }) -join ' '
-$cmd = "$envStr apply-vllm-patches || true; chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh"
+$cmd = "$envStr (command -v apply-vllm-patches >/dev/null 2>&1 && apply-vllm-patches || bash extras/patches/apply_patches.sh) || true; chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh"
 if ($Progress) { podman exec -it $ContainerName bash -lc $cmd } else { podman exec $ContainerName bash -lc $cmd }
 exit $LASTEXITCODE
 }
@@ -267,7 +267,7 @@ rm -f /tmp/gpucheck.py
 } elseif ($Setup) {
   # Use robust setup entrypoint that finds the right script (extras/dev-setup.sh or image helper)
   # Avoid in-place edits on Windows-mounted files; run a CRLF-normalized temp copy instead
-  $prefix = 'TMP_RUN=$(mktemp /tmp/run-dev-setup.XXXX.sh); tr -d "\r" < ./extras/podman/dev-setup.sh > "$TMP_RUN" || cp ./extras/podman/dev-setup.sh "$TMP_RUN"; chmod +x "$TMP_RUN" 2>/dev/null || true; apply-vllm-patches || true; '
+  $prefix = 'TMP_RUN=$(mktemp /tmp/run-dev-setup.XXXX.sh); tr -d "\r" < ./extras/podman/dev-setup.sh > "$TMP_RUN" || cp ./extras/podman/dev-setup.sh "$TMP_RUN"; chmod +x "$TMP_RUN" 2>/dev/null || true; (command -v apply-vllm-patches >/dev/null 2>&1 && apply-vllm-patches || bash extras/patches/apply_patches.sh) || true; '
   $envPrefix = ''
   if ($Mirror) { $envPrefix += 'export LOCAL_MIRROR=1; ' }
   if ($Progress) { $envPrefix += 'export PROGRESS_WATCH=1; ' }
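
Both hunks replace a bare apply-vllm-patches call with a guarded form: use the installed helper when it is on PATH, otherwise run the repo copy of the patch script, and never fail the surrounding setup. A minimal Python sketch of the same fallback logic (paths as in the command above; this is an illustration, not code from the commit):

    # Prefer the installed helper; fall back to the in-repo script.
    # check=False mirrors the trailing "|| true" in the bash command.
    import shutil, subprocess

    if shutil.which("apply-vllm-patches"):
        subprocess.run(["apply-vllm-patches"], check=False)
    else:
        subprocess.run(["bash", "extras/patches/apply_patches.sh"], check=False)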

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Example only; copy to cn-modelhub-credentials.env and fill with real values.
CN_HUB_TOKEN=example_token
CN_HUB_USERNAME=example_user

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Example only; copy to hf-credentials.env and fill with real values. DO NOT COMMIT REAL TOKENS.
HF_TOKEN=example_token
HF_USERNAME=example_user
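
Both templates are plain KEY=VALUE env files. A minimal sketch of loading one into the process environment (the parsing rules here are an assumption; tooling such as podman's --env-file consumes the same format directly):

    # Load a KEY=VALUE credentials file such as hf-credentials.env.
    # Assumes simple lines only: no quoting, no `export` prefixes.
    import os

    with open("hf-credentials.env") as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith("#") and "=" in line:
                key, _, value = line.partition("=")
                os.environ[key] = value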

orig_apply_patches.sh.txt

Lines changed: 247 additions & 0 deletions
@@ -0,0 +1,247 @@
#!/usr/bin/env bash
set -euo pipefail

# Normalize CRLF and re-exec if needed
if grep -q $'\r' "$0" 2>/dev/null; then
  TMP_SELF=$(mktemp /tmp/apply_patches_self.XXXXXX.sh)
  tr -d '\r' < "$0" > "$TMP_SELF" || cp "$0" "$TMP_SELF"
  chmod +x "$TMP_SELF" 2>/dev/null || true
  exec "$TMP_SELF" "$@"
fi

# Resolve paths
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
# Treat current working directory as repo root (wrapper cd's to /workspace)
ROOT_DIR=${ROOT_DIR:-$(pwd)}
# Prefer patches from repo under ./extras/patches; fall back to script dir (e.g., /tmp copy)
PRIMARY_PATCH_DIR="${ROOT_DIR}/extras/patches"
PATCH_DIR="$PRIMARY_PATCH_DIR"
if [ ! -d "$PATCH_DIR" ] || ! ls "$PATCH_DIR"/*.diff >/dev/null 2>&1; then
  PATCH_DIR="$SCRIPT_DIR"
fi

pushd "$ROOT_DIR" >/dev/null

shopt -s nullglob
PATCHES=("${PATCH_DIR}"/*.diff)
shopt -u nullglob

echo "[patches] Using ROOT_DIR=$ROOT_DIR"
echo "[patches] Scanning ${PATCH_DIR} for .diff files"
echo "[patches] Found ${#PATCHES[@]} .diff file(s) in ${PATCH_DIR}"
for pp in "${PATCHES[@]}"; do echo " - $(basename "$pp")"; done

for p in "${PATCHES[@]}"; do
  echo "[patches] Applying ${p}"
  # Normalize EOL to a temp patch file
  TMP_PATCH=$(mktemp /tmp/patch.XXXXXX.diff)
  tr -d '\r' < "$p" > "$TMP_PATCH" 2>/dev/null || cp "$p" "$TMP_PATCH"
  if git apply --check "$TMP_PATCH" 2>/dev/null; then
    git apply "$TMP_PATCH" || true
    continue
  fi
  echo "[patches] git apply check failed for $(basename "$p"); attempting fallback if known"
  case "$(basename "$p")" in
    0001-cumem-alloc-env-fallback.diff)
      echo "[patches] Fallback: update cumem allocator env var preference"
      python - <<'PY'
import io, os
path = os.path.join('vllm','device_allocator','cumem.py')
try:
    with io.open(path, 'r', encoding='utf-8', newline='') as f:
        src = f.read()
except FileNotFoundError:
    raise SystemExit(0)
if 'PYTORCH_ALLOC_CONF' in src:
    print('[patches] cumem already prefers PYTORCH_ALLOC_CONF; skipping')
    raise SystemExit(0)
needle = 'conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")'
if needle in src:
    new = src.replace(needle,
        'conf = os.environ.get("PYTORCH_ALLOC_CONF",\n'
        '    os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))')
    with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(new)
    print('[patches] Applied cumem env var fallback edit')
else:
    print('[patches] cumem pattern not found; skipping')
PY
      ;;
    0002-cub-reduce-to-sum-cuda13.diff)
      echo "[patches] Fallback will be handled by the post-pass rewrite"
      ;;
    *)
      echo "[patches] Unknown patch; skipping fallback"
      ;;
  esac
done

echo "[patches] Post-pass: normalize CUB reductions to device lambdas for CUDA 13"
python - <<'PY'
import io, os, re

files = []
for root, _, names in os.walk('csrc'):
    for n in names:
        if n.endswith(('.cu', '.cuh')):
            files.append(os.path.join(root, n))

def lam_for(op: str) -> str:
    if op == 'Sum':
        return '[] __device__ (auto a, auto b) { return a + b; }'
    if op == 'Max':
        return '[] __device__ (auto a, auto b) { return a > b ? a : b; }'
    return '[] __device__ (auto a, auto b) { return a < b ? a : b; }'

# Patterns
pat_method = re.compile(r'(BlockReduce\([^)]*\))\s*\.\s*(Sum|Max|Min)\(\s*([\s\S]*?)\s*\)', re.DOTALL)
pat_functor = re.compile(r'(BlockReduce\([^)]*\))\s*\.\s*Reduce\(\s*([\s\S]*?)\s*,\s*cub::(Sum|Max|Min)\s*(?:\(\)|\{\})\s*([\s\S]*?)\)', re.DOTALL)

changed_any = False
for path in files:
    try:
        with io.open(path, 'r', encoding='utf-8', newline='') as f:
            src = f.read()
    except FileNotFoundError:
        continue

    new_src = src

    # Replace method form first
    def repl_method(m):
        recv, op, expr = m.group(1), m.group(2), (m.group(3) or '').strip()
        return f"{recv}.Reduce({expr}, {lam_for(op)})"
    new_src = pat_method.sub(repl_method, new_src)

    # Replace functor form
    def repl_functor(m):
        recv, expr, op, tail = m.group(1), (m.group(2) or '').strip(), m.group(3), (m.group(4) or '').rstrip()
        return f"{recv}.Reduce({expr}, {lam_for(op)}{tail})"
    new_src = pat_functor.sub(repl_functor, new_src)

    if new_src != src:
        with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
            f.write(new_src)
        print(f"[patches] Rewrote CUB reductions in {path}")
        changed_any = True

if not changed_any:
    print('[patches] Post-pass: no changes (already applied)')
PY

# Also relax cumem allocator assert to allow user opting into expandable segments
python - <<'PY'
import io, os, re
path = os.path.join('vllm','device_allocator','cumem.py')
try:
    with io.open(path, 'r', encoding='utf-8') as f:
        src = f.read()
except FileNotFoundError:
    print('[patches] cumem.py not found; skipping assert relax')
else:
    new_src = src
    # Remove the multi-line assert block guarding expandable_segments
    new_src = re.sub(
        r"assert\s+\"expandable_segments:True\"\s+not\s+in\s+conf,\s*\\\n\s*\(.*?\)\s*\n",
        "",
        new_src,
        flags=re.DOTALL,
    )
    # If a single-line variant exists, remove it too
    new_src = re.sub(
        r"^\s*assert\s+\"expandable_segments:True\".*$\n",
        "",
        new_src,
        flags=re.MULTILINE,
    )
    if new_src != src:
        with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
            f.write(new_src)
        print('[patches] Relaxed expandable_segments assert in vllm/device_allocator/cumem.py')
    else:
        print('[patches] No expandable_segments assert to relax (already updated)')
PY

# Ensure FlashMLA target sees CUDA CCCL/targets include dirs for <cuda/std/...>
python - <<'PY'
import io, os, re
path = os.path.join('cmake','external_projects','flashmla.cmake')
try:
    with io.open(path, 'r', encoding='utf-8') as f:
        src = f.read()
except FileNotFoundError:
    print('[patches] flashmla.cmake not found; skipping CCCL include fix')
else:
    if 'target_include_directories(_flashmla_C PRIVATE ${CUDAToolkit_INCLUDE_DIRS})' in src:
        print('[patches] FlashMLA already includes CUDAToolkit include dirs')
    else:
        # Insert after the WITH_SOABI) line of define_gpu_extension_target block
        new_src = re.sub(
            r"(define_gpu_extension_target\([\s\S]*?_flashmla_C[\s\S]*?WITH_SOABI\))",
            r"\1\n  if(CUDAToolkit_INCLUDE_DIRS)\n    target_include_directories(_flashmla_C PRIVATE ${CUDAToolkit_INCLUDE_DIRS})\n  endif()",
            src,
            count=1,
        )
        if new_src != src:
            with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
                f.write(new_src)
            print('[patches] Added CUDAToolkit include dirs to FlashMLA target')
        else:
            # Fallback: append near the end before endif()
            idx = src.rfind('add_custom_target(_flashmla_C)')
            if idx == -1:
                appended = src + "\nif(CUDAToolkit_INCLUDE_DIRS)\n  target_include_directories(_flashmla_C PRIVATE ${CUDAToolkit_INCLUDE_DIRS})\nendif()\n"
            else:
                appended = src
            with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
                f.write(appended)
            print('[patches] Appended CUDAToolkit include dirs to FlashMLA target (fallback)')
PY
201+
# Strengthen FlashMLA include path injection: add explicit libcudacxx targets paths if missing
202+
python - <<'PY'
203+
import io, os, re
204+
MARK = '# _flashmla_C_LIBCUDACXX_INJECT'
205+
path = os.path.join('cmake','external_projects','flashmla.cmake')
206+
try:
207+
with io.open(path, 'r', encoding='utf-8') as f:
208+
src = f.read()
209+
except FileNotFoundError:
210+
print('[patches] flashmla.cmake not found; skipping libcudacxx inject')
211+
else:
212+
if MARK in src:
213+
print('[patches] FlashMLA libcudacxx include injection already present')
214+
else:
215+
inject_block = f"""
216+
{MARK}
217+
if(CUDAToolkit_ROOT)
218+
# Explicit libcudacxx include roots for <cuda/std/...>
219+
foreach(_cxx_inc
220+
"${{CUDAToolkit_ROOT}}/targets/x86_64-linux/include"
221+
"${{CUDAToolkit_ROOT}}/targets/x86_64-linux/include/cccl"
222+
"${{CUDAToolkit_ROOT}}/include/cccl"
223+
)
224+
if(EXISTS "${_cxx_inc}")
225+
target_include_directories(_flashmla_C PRIVATE "${_cxx_inc}")
226+
endif()
227+
endforeach()
228+
endif()
229+
""".strip('\n') + '\n'
230+
231+
# Heuristic: place after first existing target_include_directories for _flashmla_C or after define block
232+
pattern = r'(target_include_directories\(_flashmla_C[^\n]*\n)'
233+
m = re.search(pattern, src)
234+
if m:
235+
pos = m.end(1)
236+
new_src = src[:pos] + inject_block + src[pos:]
237+
else:
238+
# Append near end
239+
new_src = src + '\n' + inject_block + '\n'
240+
with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
241+
f.write(new_src)
242+
print('[patches] Injected explicit libcudacxx include directories into FlashMLA target')
243+
PY
244+
245+
popd >/dev/null
246+
247+
echo "[patches] Done."

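The script's per-patch flow is: normalize line endings, dry-run with git apply --check, then apply or fall back. A minimal standalone sketch of that check-then-apply step (the patch name is one of the cases handled above; git must be on PATH):

    # Dry-run a patch before applying it, as the script's main loop does.
    import subprocess

    patch = "0001-cumem-alloc-env-fallback.diff"
    ok = subprocess.run(["git", "apply", "--check", patch],
                        capture_output=True).returncode == 0
    if ok:
        subprocess.run(["git", "apply", patch], check=False)
    else:
        print(f"[patches] git apply check failed for {patch}; fallback needed")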