47 changes: 41 additions & 6 deletions pyproject.toml
@@ -13,14 +13,11 @@ dependencies = [
"deepspeed>=0.17.1; sys_platform == 'linux'",
"torch>=2.7.1",
"transformers>=5.3.0,<5.4.0",
"flash-attn>=2.8.0; sys_platform == 'linux'",
"numpy",
"numpy>=2.4.1",
"trl>=1.0,<2.0",
"vllm>=0.19.0",
"rich>=14.1.0",
"pillow>=11.3.0",
"mpi4py>=4.1.0; sys_platform == 'linux'",
"liger-kernel>=0.6.2; sys_platform == 'linux'",
"wandb>=0.22.3",
"ray==2.48.0",
"pyyaml>=6.0",
@@ -36,6 +33,22 @@ dependencies = [
rl-env = [
"openenv-core>=0.1; sys_platform == 'linux'",
]
# NVIDIA / CUDA GPU stack.
# Install with: uv sync --extra cuda
cuda = [
"flash-attn>=2.8.0; sys_platform == 'linux'",
"liger-kernel>=0.6.2; sys_platform == 'linux'",
"vllm>=0.19.0; sys_platform == 'linux'",
]
# AMD / ROCm GPU stack.
# Install with: uv sync --extra rocm
# Post-install for vLLM + flash-attn (vLLM ROCm wheels bundle flash-attn):
# uv pip uninstall vllm torch && uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/
rocm = [
"torch>=2.7.1; sys_platform == 'linux'",
"torchvision>=0.24.1; sys_platform == 'linux'",
"liger-kernel>=0.6.2; sys_platform == 'linux'",
]

[project.scripts]
leap-finetune = "leap_finetune:main"
@@ -50,11 +63,33 @@ environments = [
"sys_platform == 'darwin'",
"sys_platform == 'linux'",
]
# vLLM 0.19.x still declares transformers<5, need to override it
# cuda and rocm pull torch from different indexes — they cannot coexist in one env.
conflicts = [
[{ extra = "cuda" }, { extra = "rocm" }],
]

override-dependencies = [
# ROCm torch depends on triton-rocm / pytorch-triton-rocm, but those wheels
# have broken RECORD files on the ROCm index.
"triton-rocm ; sys_platform == 'impossible'",
"pytorch-triton-rocm ; sys_platform == 'impossible'",
# vllm 0.19.x declares transformers<5; override so it resolves with 5.3+
"transformers>=5.3.0,<5.4.0",
"huggingface_hub>=1.0",
"numpy<2.3",
"numpy<2.5",
]

[[tool.uv.index]]
name = "pytorch-rocm"
url = "https://download.pytorch.org/whl/rocm7.1"
explicit = true

[tool.uv.sources]
torch = [
{ index = "pytorch-rocm", extra = "rocm" },
]
torchvision = [
{ index = "pytorch-rocm", extra = "rocm" },
]

[tool.pytest.ini_options]
25 changes: 12 additions & 13 deletions rewards/README.md
@@ -40,27 +40,27 @@ rewards:
recipe: "./rewards/tasks/vlm_grounding/recipe.py::VLMGroundingIoURecipe"
funcs:
- "./rewards/length.py::length_reward"
weights: [0.1, 1.0, 0.05] # recipe weights + the stacked func weight
weights: [0.1, 1.0, 0.05] # recipe weights + the stacked func weight
```

## Shipped primitives

| File | Function | What it does | Required columns |
|------|----------|--------------|------------------|
| File | Function | What it does | Required columns |
| ------------- | ----------------- | ----------------------------------------------------------------------------- | ---------------- |
| `accuracy.py` | `accuracy_reward` | Math accuracy via `math_verify` (re-export of `trl.rewards.accuracy_reward`). | `solution` (str) |
| `length.py` | `length_reward` | Length-based shaping reward, scaled to `[0, 1]`. | none |
| `length.py` | `length_reward` | Length-based shaping reward, scaled to `[0, 1]`. | none |

## Shipped task bundles

Full list in [`tasks/README.md`](tasks/README.md).

| Task | Recipe | Reward shape |
|------|--------|--------------|
| VLM visual grounding | `tasks/vlm_grounding/recipe.py::VLMGroundingIoURecipe` | strict JSON format (0.1) + F1 of matched IoUs (1.0) |
| Task | Recipe | Reward shape |
| --------------------------- | ------------------------------------------------------- | ---------------------------------------------------- |
| VLM visual grounding | `tasks/vlm_grounding/recipe.py::VLMGroundingIoURecipe` | strict JSON format (0.1) + F1 of matched IoUs (1.0) |
| VLM visual grounding (CIoU) | `tasks/vlm_grounding/recipe.py::VLMGroundingCIoURecipe` | strict JSON format (0.1) + F1 of matched CIoUs (1.0) |
| GSM8K | `tasks/gsm8k/recipe.py::GSM8KRecipe` | numeric exact match via `#### N` (1.0) |
| MCQA | `tasks/mcqa/recipe.py::MCQARecipe` | letter match A..J (1.0) |
| IFEval | `tasks/ifeval/recipe.py::IFEvalRecipe` | fraction of constraints satisfied (1.0) |
| GSM8K | `tasks/gsm8k/recipe.py::GSM8KRecipe` | numeric exact match via `#### N` (1.0) |
| MCQA | `tasks/mcqa/recipe.py::MCQARecipe` | letter match A..J (1.0) |
| IFEval | `tasks/ifeval/recipe.py::IFEvalRecipe` | fraction of constraints satisfied (1.0) |

## Reward function signature

@@ -74,9 +74,8 @@ def reward_fn(completions, **kwargs) -> list[float | None]:
string prompts pass through the raw string. Extract defensively:
`c[0]["content"] if isinstance(c, list) else c`.
- **`**kwargs`** — every other column in the dataset row is forwarded
as a keyword with the same name. TRL also forwards `prompts`,
`completion_ids`, `trainer_state`, and (when `rl_env` is used)
`env_reward`. Use `**kwargs` so unused fields are ignored.
  as a keyword with the same name. TRL also forwards `prompts`,
  `completion_ids`, `trainer_state`, and (when `rl_env` is used)
  `env_reward`. Use `**kwargs` so unused fields are ignored.
- **Return** — a list of floats, one per completion. Returning `None`
for a sample marks it "not applicable" and drops it from advantage
aggregation.
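
As a concrete illustration, a minimal function following this contract could look like the sketch below. The `solution` column name is an assumption borrowed from the shipped primitives; any dataset column forwarded through `**kwargs` works the same way.

```python
def contains_gold_reward(completions, solution=None, **kwargs):
    """Toy reward: 1.0 if the gold string appears in the completion, else 0.0.

    `solution` is a hypothetical dataset column; rows without one return
    None so they are dropped from advantage aggregation.
    """
    rewards = []
    for completion, gold in zip(completions, solution):
        # Conversational datasets pass a messages list; plain datasets a string.
        text = completion[0]["content"] if isinstance(completion, list) else completion
        rewards.append(None if gold is None else float(gold in text))
    return rewards
```

All other forwarded kwargs (`prompts`, `completion_ids`, …) are absorbed by `**kwargs` and ignored.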
19 changes: 10 additions & 9 deletions rewards/tasks/README.md
@@ -16,9 +16,9 @@ rewards:
Bounding-box grounding where the model outputs a bare JSON array of
`{"label", "bbox"}` dicts in normalized `[0, 1]` coordinates.

| Recipe | Reward | Default weights |
|---|---|---|
| `VLMGroundingIoURecipe` | Strict JSON format check + F1 of Hungarian-matched IoUs. | `strict_format_reward: 0.1`, `iou_f1_reward: 1.0` |
| Recipe | Reward | Default weights |
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------- |
| `VLMGroundingIoURecipe` | Strict JSON format check + F1 of Hungarian-matched IoUs. | `strict_format_reward: 0.1`, `iou_f1_reward: 1.0` |
| `VLMGroundingCIoURecipe` | Same F1 structure, but each matched pair uses Complete-IoU (IoU − center-distance − aspect-ratio). The Hungarian matcher also runs on CIoU. | `strict_format_reward: 0.1`, `ciou_f1_reward: 1.0` |
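
For reference, the Complete-IoU penalty structure mentioned in the table can be sketched as below. This is the standard CIoU formulation written out for illustration, not the recipe's actual implementation; boxes are `[x1, y1, x2, y2]` in normalized coordinates.

```python
import math

def ciou(b1, b2):
    """Complete-IoU sketch: IoU minus center-distance and aspect-ratio penalties."""
    # Plain IoU.
    ix1, iy1 = max(b1[0], b2[0]), max(b1[1], b2[1])
    ix2, iy2 = min(b1[2], b2[2]), min(b1[3], b2[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    a1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    a2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    iou = inter / (a1 + a2 - inter + 1e-9)
    # Center-distance penalty, normalized by the enclosing box's diagonal.
    rho2 = (((b1[0] + b1[2]) - (b2[0] + b2[2])) / 2) ** 2 \
         + (((b1[1] + b1[3]) - (b2[1] + b2[3])) / 2) ** 2
    diag2 = (max(b1[2], b2[2]) - min(b1[0], b2[0])) ** 2 \
          + (max(b1[3], b2[3]) - min(b1[1], b2[1])) ** 2 + 1e-9
    # Aspect-ratio consistency term.
    v = (4 / math.pi ** 2) * (
        math.atan((b1[2] - b1[0]) / (b1[3] - b1[1] + 1e-9))
        - math.atan((b2[2] - b2[0]) / (b2[3] - b2[1] + 1e-9))
    ) ** 2
    alpha = v / (1 - iou + v + 1e-9)
    return iou - rho2 / diag2 - alpha * v
```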

**Required columns:** `prompt` (VLM messages list with image +
@@ -55,8 +55,8 @@ penalized). Plain IoU-F1 is typically enough for multi-object scenes.
Exact-match reward on the final numeric answer, extracted via the
`#### N` marker.

| Recipe | Reward | Default weights |
|---|---|---|
| Recipe | Reward | Default weights |
| ------------- | -------------------------------------------------------------------------------------------------------------- | ------------------- |
| `GSM8KRecipe` | Extract the final number via `#### N` (with last-number fallback) and compare to the gold. 1.0 / 0.0 / `None`. | `gsm8k_reward: 1.0` |

**Required columns:** `prompt` (question), `solution` (bare numeric
@@ -88,8 +88,8 @@ Letter-match reward on the extracted answer choice. Supports
`Answer: X` / `\boxed{X}` / trailing-letter patterns; the last match
in the completion wins.

| Recipe | Reward | Default weights |
|---|---|---|
| Recipe | Reward | Default weights |
| ------------ | ------------------------------------------------------------------------------------------- | ------------------ |
| `MCQARecipe` | Extract a letter A..J from the completion tail and compare to the gold. 1.0 / 0.0 / `None`. | `mcqa_reward: 1.0` |

**Required columns:** `prompt` (question + labeled options),
@@ -120,8 +120,8 @@ training_config:
Dense reward in `[0, 1]`: the fraction of supported constraints the
completion satisfies.

| Recipe | Reward | Default weights |
|---|---|---|
| Recipe | Reward | Default weights |
| -------------- | ------------------------------------------------------------------------------------------------------------- | -------------------- |
| `IFEvalRecipe` | Parse a JSON constraint spec from `solution`, check each supported constraint, return the fraction that pass. | `ifeval_reward: 1.0` |
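
The scoring reduces to something like this sketch. The constraint-spec schema and the `length:max_words` checker are illustrative assumptions; only `punctuation:no_comma` is taken from the supported list, and unsupported types are skipped.

```python
def constraint_fraction(completion: str, constraints: list[dict]) -> float:
    """Fraction of supported constraints the completion satisfies."""
    checkers = {
        "punctuation:no_comma": lambda text, spec: "," not in text,
        # Hypothetical parameterized constraint type, for illustration:
        "length:max_words": lambda text, spec: len(text.split()) <= spec["max_words"],
    }
    results = [
        checkers[spec["type"]](completion, spec)
        for spec in constraints
        if spec["type"] in checkers
    ]
    return sum(results) / len(results) if results else 0.0
```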

**Supported constraint types:** `punctuation:no_comma`,
@@ -182,6 +182,7 @@ and the reward-function API contract.

__all__ = ["YourTaskRecipe", "your_reward_fn"]
```

4. Add a row to this README.
5. Reference the recipe from YAML:
`./rewards/tasks/<your_task>/recipe.py::YourTaskRecipe`.
30 changes: 15 additions & 15 deletions src/leap_finetune/rl_envs/README.md
@@ -50,26 +50,26 @@ When `rl_env:` is set, the training loop:

```yaml
rl_env:
source: "qgallouedec/echo_env" # HF Hub repo-id OR installed env name
base_url: null # optional: connect to a running HTTP server
docker_image: null # optional: override the default Docker image
env_vars: {} # optional: env vars forwarded to the env container
wait_timeout: 30.0 # seconds to wait for container startup
skip_install: false # true → use GenericEnvClient (no package install)
trust_remote_code: true # skip interactive confirmation (default true)
max_turns: 1 # only 1 supported in the default adapter
reset_kwargs: {} # kwargs forwarded to env.reset() each episode
action_key: "message" # dict key for env.step({action_key: text})
source: "qgallouedec/echo_env" # HF Hub repo-id OR installed env name
base_url: null # optional: connect to a running HTTP server
docker_image: null # optional: override the default Docker image
env_vars: {} # optional: env vars forwarded to the env container
wait_timeout: 30.0 # seconds to wait for container startup
skip_install: false # true → use GenericEnvClient (no package install)
trust_remote_code: true # skip interactive confirmation (default true)
max_turns: 1 # only 1 supported in the default adapter
reset_kwargs: {} # kwargs forwarded to env.reset() each episode
action_key: "message" # dict key for env.step({action_key: text})
```

## Picking a source

| Situation | What to set |
|-----------|-------------|
| Situation | What to set |
| ------------------------------------------------------------------------------ | ---------------------------------------------------------------------- |
| Env is published as an HF Space and you want to talk to it directly over HTTPS | `source`, `base_url: "https://<space>.hf.space"`, `skip_install: true` |
| Env is published on the Hub and you want auto-install + auto-run | `source: "org/env-name"` (needs Docker) |
| You're running the env locally via `docker run -p 8001:8001 ...` | `base_url: "http://localhost:8001"` |
| You have a custom Docker image | `docker_image: "my-env:latest"` |
| Env is published on the Hub and you want auto-install + auto-run | `source: "org/env-name"` (needs Docker) |
| You're running the env locally via `docker run -p 8001:8001 ...` | `base_url: "http://localhost:8001"` |
| You have a custom Docker image | `docker_image: "my-env:latest"` |

**On Modal**, prefer the hosted-Space path — Modal's training
container doesn't have Docker-in-Docker, so auto-install from Hub