Commit 56fd993

extras: checkpoint patch tooling updates
1 parent 15dcd15 commit 56fd993

File tree

5 files changed: +299 -2 lines changed

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
   set(FlashMLA_SOURCES
     ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
     ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
     ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
     ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
     ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
@@
   define_gpu_extension_target(
     _flashmla_C
     DESTINATION vllm
     LANGUAGE ${VLLM_GPU_LANG}
     SOURCES ${FlashMLA_SOURCES}
     COMPILE_FLAGS ${VLLM_GPU_FLAGS}
     ARCHITECTURES ${VLLM_GPU_ARCHES}
     INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
     USE_SABI 3
     WITH_SOABI)
+  if(CUDAToolkit_INCLUDE_DIRS)
+    target_include_directories(_flashmla_C PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+  endif()
+  # _flashmla_C_LIBCUDACXX_INJECT
+  if(CUDAToolkit_ROOT)
+    foreach(_cxx_inc
+      "${CUDAToolkit_ROOT}/targets/x86_64-linux/include"
+      "${CUDAToolkit_ROOT}/targets/x86_64-linux/include/cccl"
+      "${CUDAToolkit_ROOT}/include/cccl"
+    )
+      if(EXISTS "${_cxx_inc}")
+        target_include_directories(_flashmla_C PRIVATE "${_cxx_inc}")
+      endif()
+    endforeach()
+  endif()
   if(CUDAToolkit_INCLUDE_DIRS)
     target_include_directories(_flashmla_C PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
   endif()
 else()
   # Create an empty target for setup.py when not targeting sm90a systems
   add_custom_target(_flashmla_C)
 endif()
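
The foreach block above only registers include roots that actually exist on disk. A minimal Python sketch of the same existence probe (the CUDA_HOME or /usr/local/cuda default below is an assumption; the real build uses the CUDAToolkit_ROOT that CMake resolves):

    # check_cccl_includes.py -- hypothetical helper, not part of the commit
    import os

    root = os.environ.get("CUDA_HOME", "/usr/local/cuda")
    for inc in (
        os.path.join(root, "targets", "x86_64-linux", "include"),
        os.path.join(root, "targets", "x86_64-linux", "include", "cccl"),
        os.path.join(root, "include", "cccl"),
    ):
        # The cmake foreach adds each of these as a PRIVATE include dir iff it exists
        print(inc, "->", "exists" if os.path.isdir(inc) else "missing")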

extras/podman/run.ps1

Lines changed: 2 additions & 2 deletions
@@ -166,7 +166,7 @@ nvidia-smi || true
 if ($Progress) { $envs += @('PROGRESS_WATCH=1') }
 $envs += @('NVIDIA_VISIBLE_DEVICES=all')
 $envStr = ($envs | ForEach-Object { "export $_;" }) -join ' '
-$cmd = "$envStr apply-vllm-patches || true; chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh"
+$cmd = "$envStr (command -v apply-vllm-patches >/dev/null 2>&1 && apply-vllm-patches || bash extras/patches/apply_patches.sh) || true; chmod +x ./extras/dev-setup.sh 2>/dev/null || true; ./extras/dev-setup.sh"
 if ($Progress) { podman exec -it $ContainerName bash -lc $cmd } else { podman exec $ContainerName bash -lc $cmd }
 exit $LASTEXITCODE
 }
@@ -267,7 +267,7 @@ rm -f /tmp/gpucheck.py
 } elseif ($Setup) {
   # Use robust setup entrypoint that finds the right script (extras/dev-setup.sh or image helper)
   # Avoid in-place edits on Windows-mounted files; run a CRLF-normalized temp copy instead
-  $prefix = 'TMP_RUN=$(mktemp /tmp/run-dev-setup.XXXX.sh); tr -d "\r" < ./extras/podman/dev-setup.sh > "$TMP_RUN" || cp ./extras/podman/dev-setup.sh "$TMP_RUN"; chmod +x "$TMP_RUN" 2>/dev/null || true; apply-vllm-patches || true; '
+  $prefix = 'TMP_RUN=$(mktemp /tmp/run-dev-setup.XXXX.sh); tr -d "\r" < ./extras/podman/dev-setup.sh > "$TMP_RUN" || cp ./extras/podman/dev-setup.sh "$TMP_RUN"; chmod +x "$TMP_RUN" 2>/dev/null || true; (command -v apply-vllm-patches >/dev/null 2>&1 && apply-vllm-patches || bash extras/patches/apply_patches.sh) || true; '
   $envPrefix = ''
   if ($Mirror) { $envPrefix += 'export LOCAL_MIRROR=1; ' }
   if ($Progress) { $envPrefix += 'export PROGRESS_WATCH=1; ' }
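
Both hunks replace a bare apply-vllm-patches call with a guarded form: use the installed helper when it is on PATH, otherwise run the repo copy of the patch script, and never fail the surrounding setup. A minimal Python sketch of the same fallback logic (paths as in the command above; this is an illustration, not code from the commit):

    # Prefer the installed helper; fall back to the in-repo script.
    # check=False mirrors the trailing "|| true" in the bash command.
    import shutil, subprocess

    if shutil.which("apply-vllm-patches"):
        subprocess.run(["apply-vllm-patches"], check=False)
    else:
        subprocess.run(["bash", "extras/patches/apply_patches.sh"], check=False)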

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Example only; copy to cn-modelhub-credentials.env and fill with real values.
CN_HUB_TOKEN=example_token
CN_HUB_USERNAME=example_user

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Example only; copy to hf-credentials.env and fill with real values. DO NOT COMMIT REAL TOKENS.
HF_TOKEN=example_token
HF_USERNAME=example_user
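
Both templates are plain KEY=VALUE env files. A minimal sketch of loading one into the process environment (the parsing rules here are an assumption; tooling such as podman's --env-file consumes the same format directly):

    # Load a KEY=VALUE credentials file such as hf-credentials.env.
    # Assumes simple lines only: no quoting, no `export` prefixes.
    import os

    with open("hf-credentials.env") as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith("#") and "=" in line:
                key, _, value = line.partition("=")
                os.environ[key] = value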

orig_apply_patches.sh.txt

Lines changed: 247 additions & 0 deletions
@@ -0,0 +1,247 @@
#!/usr/bin/env bash
set -euo pipefail

# Normalize CRLF and re-exec if needed
if grep -q $'\r' "$0" 2>/dev/null; then
  TMP_SELF=$(mktemp /tmp/apply_patches_self.XXXXXX.sh)
  tr -d '\r' < "$0" > "$TMP_SELF" || cp "$0" "$TMP_SELF"
  chmod +x "$TMP_SELF" 2>/dev/null || true
  exec "$TMP_SELF" "$@"
fi

# Resolve paths
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
# Treat current working directory as repo root (wrapper cd's to /workspace)
ROOT_DIR=${ROOT_DIR:-$(pwd)}
# Prefer patches from repo under ./extras/patches; fall back to script dir (e.g., /tmp copy)
PRIMARY_PATCH_DIR="${ROOT_DIR}/extras/patches"
PATCH_DIR="$PRIMARY_PATCH_DIR"
if [ ! -d "$PATCH_DIR" ] || ! ls "$PATCH_DIR"/*.diff >/dev/null 2>&1; then
  PATCH_DIR="$SCRIPT_DIR"
fi

pushd "$ROOT_DIR" >/dev/null

shopt -s nullglob
PATCHES=("${PATCH_DIR}"/*.diff)
shopt -u nullglob

echo "[patches] Using ROOT_DIR=$ROOT_DIR"
echo "[patches] Scanning ${PATCH_DIR} for .diff files"
echo "[patches] Found ${#PATCHES[@]} .diff file(s) in ${PATCH_DIR}"
for pp in "${PATCHES[@]}"; do echo " - $(basename "$pp")"; done

for p in "${PATCHES[@]}"; do
  echo "[patches] Applying ${p}"
  # Normalize EOL to a temp patch file
  TMP_PATCH=$(mktemp /tmp/patch.XXXXXX.diff)
  tr -d '\r' < "$p" > "$TMP_PATCH" 2>/dev/null || cp "$p" "$TMP_PATCH"
  if git apply --check "$TMP_PATCH" 2>/dev/null; then
    git apply "$TMP_PATCH" || true
    continue
  fi
  echo "[patches] git apply check failed for $(basename "$p"); attempting fallback if known"
  case "$(basename "$p")" in
    0001-cumem-alloc-env-fallback.diff)
      echo "[patches] Fallback: update cumem allocator env var preference"
      python - <<'PY'
import io, os
path = os.path.join('vllm','device_allocator','cumem.py')
try:
    with io.open(path, 'r', encoding='utf-8', newline='') as f:
        src = f.read()
except FileNotFoundError:
    raise SystemExit(0)
if 'PYTORCH_ALLOC_CONF' in src:
    print('[patches] cumem already prefers PYTORCH_ALLOC_CONF; skipping')
    raise SystemExit(0)
needle = 'conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")'
if needle in src:
    new = src.replace(needle,
        'conf = os.environ.get("PYTORCH_ALLOC_CONF",\n'
        '    os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))')
    with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(new)
    print('[patches] Applied cumem env var fallback edit')
else:
    print('[patches] cumem pattern not found; skipping')
PY
      ;;
    0002-cub-reduce-to-sum-cuda13.diff)
      echo "[patches] Fallback will be handled by the post-pass rewrite"
      ;;
    *)
      echo "[patches] Unknown patch; skipping fallback"
      ;;
  esac
done

echo "[patches] Post-pass: normalize CUB reductions to device lambdas for CUDA 13"
python - <<'PY'
import io, os, re

files = []
for root, _, names in os.walk('csrc'):
    for n in names:
        if n.endswith(('.cu', '.cuh')):
            files.append(os.path.join(root, n))

def lam_for(op: str) -> str:
    if op == 'Sum':
        return '[] __device__ (auto a, auto b) { return a + b; }'
    if op == 'Max':
        return '[] __device__ (auto a, auto b) { return a > b ? a : b; }'
    return '[] __device__ (auto a, auto b) { return a < b ? a : b; }'

# Patterns
pat_method = re.compile(r'(BlockReduce\([^)]*\))\s*\.\s*(Sum|Max|Min)\(\s*([\s\S]*?)\s*\)', re.DOTALL)
pat_functor = re.compile(r'(BlockReduce\([^)]*\))\s*\.\s*Reduce\(\s*([\s\S]*?)\s*,\s*cub::(Sum|Max|Min)\s*(?:\(\)|\{\})\s*([\s\S]*?)\)', re.DOTALL)

changed_any = False
for path in files:
    try:
        with io.open(path, 'r', encoding='utf-8', newline='') as f:
            src = f.read()
    except FileNotFoundError:
        continue

    new_src = src

    # Replace method form first
    def repl_method(m):
        recv, op, expr = m.group(1), m.group(2), (m.group(3) or '').strip()
        return f"{recv}.Reduce({expr}, {lam_for(op)})"
    new_src = pat_method.sub(repl_method, new_src)

    # Replace functor form
    def repl_functor(m):
        recv, expr, op, tail = m.group(1), (m.group(2) or '').strip(), m.group(3), (m.group(4) or '').rstrip()
        return f"{recv}.Reduce({expr}, {lam_for(op)}{tail})"
    new_src = pat_functor.sub(repl_functor, new_src)

    if new_src != src:
        with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
            f.write(new_src)
        print(f"[patches] Rewrote CUB reductions in {path}")
        changed_any = True

if not changed_any:
    print('[patches] Post-pass: no changes (already applied)')
PY

# Also relax cumem allocator assert to allow user opting into expandable segments
python - <<'PY'
import io, os, re
path = os.path.join('vllm','device_allocator','cumem.py')
try:
    with io.open(path, 'r', encoding='utf-8') as f:
        src = f.read()
except FileNotFoundError:
    print('[patches] cumem.py not found; skipping assert relax')
else:
    new_src = src
    # Remove the multi-line assert block guarding expandable_segments
    new_src = re.sub(
        r"assert\s+\"expandable_segments:True\"\s+not\s+in\s+conf,\s*\\\n\s*\(.*?\)\s*\n",
        "",
        new_src,
        flags=re.DOTALL,
    )
    # If a single-line variant exists, remove it too
    new_src = re.sub(
        r"^\s*assert\s+\"expandable_segments:True\".*$\n",
        "",
        new_src,
        flags=re.MULTILINE,
    )
    if new_src != src:
        with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
            f.write(new_src)
        print('[patches] Relaxed expandable_segments assert in vllm/device_allocator/cumem.py')
    else:
        print('[patches] No expandable_segments assert to relax (already updated)')
PY

# Ensure FlashMLA target sees CUDA CCCL/targets include dirs for <cuda/std/...>
python - <<'PY'
import io, os, re
path = os.path.join('cmake','external_projects','flashmla.cmake')
try:
    with io.open(path, 'r', encoding='utf-8') as f:
        src = f.read()
except FileNotFoundError:
    print('[patches] flashmla.cmake not found; skipping CCCL include fix')
else:
    if 'target_include_directories(_flashmla_C PRIVATE ${CUDAToolkit_INCLUDE_DIRS})' in src:
        print('[patches] FlashMLA already includes CUDAToolkit include dirs')
    else:
        # Insert after the WITH_SOABI) line of define_gpu_extension_target block
        new_src = re.sub(
            r"(define_gpu_extension_target\([\s\S]*?_flashmla_C[\s\S]*?WITH_SOABI\))",
            r"\1\n  if(CUDAToolkit_INCLUDE_DIRS)\n    target_include_directories(_flashmla_C PRIVATE ${CUDAToolkit_INCLUDE_DIRS})\n  endif()",
            src,
            count=1,
        )
        if new_src != src:
            with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
                f.write(new_src)
            print('[patches] Added CUDAToolkit include dirs to FlashMLA target')
        else:
            # Fallback: append near the end before endif()
            idx = src.rfind('add_custom_target(_flashmla_C)')
            if idx == -1:
                appended = src + "\nif(CUDAToolkit_INCLUDE_DIRS)\n  target_include_directories(_flashmla_C PRIVATE ${CUDAToolkit_INCLUDE_DIRS})\nendif()\n"
            else:
                appended = src
            with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
                f.write(appended)
            print('[patches] Appended CUDAToolkit include dirs to FlashMLA target (fallback)')
PY
201+
# Strengthen FlashMLA include path injection: add explicit libcudacxx targets paths if missing
202+
python - <<'PY'
203+
import io, os, re
204+
MARK = '# _flashmla_C_LIBCUDACXX_INJECT'
205+
path = os.path.join('cmake','external_projects','flashmla.cmake')
206+
try:
207+
with io.open(path, 'r', encoding='utf-8') as f:
208+
src = f.read()
209+
except FileNotFoundError:
210+
print('[patches] flashmla.cmake not found; skipping libcudacxx inject')
211+
else:
212+
if MARK in src:
213+
print('[patches] FlashMLA libcudacxx include injection already present')
214+
else:
215+
inject_block = f"""
216+
{MARK}
217+
if(CUDAToolkit_ROOT)
218+
# Explicit libcudacxx include roots for <cuda/std/...>
219+
foreach(_cxx_inc
220+
"${{CUDAToolkit_ROOT}}/targets/x86_64-linux/include"
221+
"${{CUDAToolkit_ROOT}}/targets/x86_64-linux/include/cccl"
222+
"${{CUDAToolkit_ROOT}}/include/cccl"
223+
)
224+
if(EXISTS "${_cxx_inc}")
225+
target_include_directories(_flashmla_C PRIVATE "${_cxx_inc}")
226+
endif()
227+
endforeach()
228+
endif()
229+
""".strip('\n') + '\n'
230+
231+
# Heuristic: place after first existing target_include_directories for _flashmla_C or after define block
232+
pattern = r'(target_include_directories\(_flashmla_C[^\n]*\n)'
233+
m = re.search(pattern, src)
234+
if m:
235+
pos = m.end(1)
236+
new_src = src[:pos] + inject_block + src[pos:]
237+
else:
238+
# Append near end
239+
new_src = src + '\n' + inject_block + '\n'
240+
with io.open(path, 'w', encoding='utf-8', newline='\n') as f:
241+
f.write(new_src)
242+
print('[patches] Injected explicit libcudacxx include directories into FlashMLA target')
243+
PY
244+
245+
popd >/dev/null
246+
247+
echo "[patches] Done."

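The script's per-patch flow is: normalize line endings, dry-run with git apply --check, then apply or fall back. A minimal standalone sketch of that check-then-apply step (the patch name is one of the cases handled above; git must be on PATH):

    # Dry-run a patch before applying it, as the script's main loop does.
    import subprocess

    patch = "0001-cumem-alloc-env-fallback.diff"
    ok = subprocess.run(["git", "apply", "--check", patch],
                        capture_output=True).returncode == 0
    if ok:
        subprocess.run(["git", "apply", patch], check=False)
    else:
        print(f"[patches] git apply check failed for {patch}; fallback needed")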