Merged
41 changes: 37 additions & 4 deletions README.md
@@ -14,9 +14,11 @@
</div>
</br>

A minimal fine-tuning repo for LFM2, fully built on Open Source.
<p align="center">
<a href="#-setup">Setup</a> · <a href="#-quickstart">Quickstart</a> · <a href="#-expected-dataset-formats">Dataset Formats</a> · <a href="#-tool-calling-datasets">Tool Calling</a> · <a href="#-resuming-training">Resuming Training</a> · <a href="#-evaluation-benchmarks">Benchmarks</a> · <a href="#-advanced-configuration">Advanced Config</a>
</p>

We support different acceleration backends, including GPU nodes of 8xH100 80GB (both single node and multi node) as well as Modal (H100, H200, B200, ..) in case you don't have your own GPUs.
LEAP-Finetune is a minimal fine-tuning repo for LFM2, fully built on Open Source. It handles multi-GPU orchestration, dataset formatting and validation, and model checkpointing. We support different acceleration backends, including 8xH100 80GB GPU nodes (both single-node and multi-node) as well as Modal (H100, H200, B200, ..) in case you don't have your own GPUs.

For feature requests or if you have a different setup, reach out to [support@liquid.ai](mailto:support@liquid.ai) and tell us about your specific configuration.

@@ -248,9 +250,40 @@ When training is done, you can bundle your output checkpoint with `leap-bundle`

> **Note**: VLM datasets commonly keep images in a separate column that is referenced from the messages column. If your image URLs or paths live in a separate column from your messages, you'll need to merge the images into the `messages` column as shown above.

### 🔧 Tool Calling Datasets

Tool calls use LFM bracket notation pre-baked in the assistant `content` field. Tool definitions go in the system prompt, and tool responses use `role: "tool"`.

```json
{
  "messages": [
    {
      "role": "system",
      "content": "List of tools: [{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather for a city\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\"}},\"required\":[\"location\"]}}},{\"type\":\"function\",\"function\":{\"name\":\"search_web\",\"description\":\"Search the web\",\"parameters\":{\"type\":\"object\",\"properties\":{\"query\":{\"type\":\"string\"}},\"required\":[\"query\"]}}},{\"type\":\"function\",\"function\":{\"name\":\"send_email\",\"description\":\"Send an email\",\"parameters\":{\"type\":\"object\",\"properties\":{\"to\":{\"type\":\"string\"},\"body\":{\"type\":\"string\"}},\"required\":[\"to\",\"body\"]}}}]"
    },
    { "role": "user", "content": "What's the weather in Boston?" },
    {
      "role": "assistant",
      "content": "<|tool_call_start|>[get_weather(location=\"Boston\")]<|tool_call_end|>"
    },
    {
      "role": "tool",
      "content": "{\"temperature\": 72, \"condition\": \"sunny\"}"
    },
    { "role": "assistant", "content": "It's 72°F and sunny in Boston." }
  ]
}
```

- Tool calls must be pre-baked in `content` using `<|tool_call_start|>[func(args)]<|tool_call_end|>` bracket notation
- Structured `tool_calls` fields (OpenAI format) are auto-converted if present
- Foreign formats (e.g. `<tool_call>` XML) are rejected with an actionable error
- Do not include `<|tool_response_start|>` / `<|tool_response_end|>` markers in `role: "tool"` messages — the LFM2 chat template adds these automatically during tokenization
- **LFM2 models** additionally expect `<|tool_list_start|>` / `<|tool_list_end|>` around tool definitions in the system prompt. Include these in your data if training an LFM2 model; omit them for LFM2.5. The pipeline warns on mismatches and auto-strips `<|tool_list_start|>` when training LFM2.5.
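The auto-conversion of structured `tool_calls` can be sketched as follows. This is a minimal illustration, not the repo's actual converter; `to_lfm_bracket` is a hypothetical name, and the real logic lives in the pipeline's tool-call utilities.

```python
import json


def to_lfm_bracket(message: dict) -> dict:
    """Sketch: render an OpenAI-style structured `tool_calls` field into
    LFM bracket notation baked into the assistant `content` field."""
    calls = message.get("tool_calls")
    if not calls:
        return message  # already pre-baked (or not a tool-call turn)

    rendered = []
    for call in calls:
        fn = call["function"]
        # OpenAI stores arguments as a JSON string; tolerate a dict too
        args = fn["arguments"]
        if isinstance(args, str):
            args = json.loads(args)
        arg_str = ", ".join(f"{k}={json.dumps(v)}" for k, v in args.items())
        rendered.append(f"{fn['name']}({arg_str})")

    content = f"<|tool_call_start|>[{', '.join(rendered)}]<|tool_call_end|>"
    return {"role": "assistant", "content": content}
```

Rows that already use bracket notation pass through untouched, which is why mixing the two styles in one dataset is harmless.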

## 🔄 Resuming Training

If a run is interrupted (SLURM preemption, crash, etc.), you can resume from the last checkpoint with full optimizer state, LR schedule, and wandb continuity.
If a run is interrupted (GPU timeout, crash, SLURM preemption, etc.), you can resume from the last checkpoint with full optimizer state, LR schedule, and wandb continuity.

Add `resume_from_checkpoint` to your `training_config`:

@@ -266,7 +299,7 @@ training_config:
  resume_from_checkpoint: "/path/to/outputs/my_project/run_name/checkpoint-step-8000"
```

**What gets restored:** model weights, optimizer states, LR scheduler position, training step counter, and RNG states.
**What gets restored:** model weights, optimizer states, LR scheduler position, training step counter, and RNG states. To resume a run, `save_only_model` must be set to `False`.
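A resumable run's `training_config` might look like this sketch (the checkpoint path is illustrative):

```yaml
training_config:
  save_only_model: false  # full optimizer/scheduler state is required to resume
  resume_from_checkpoint: "/path/to/outputs/my_project/run_name/checkpoint-step-8000"
```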

**Wandb continuity:** The wandb run ID is saved to `<run_dir>/.wandb_run_id` automatically. On resume, it restores the same wandb run. Fresh runs always get a new wandb run.
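The run-ID persistence pattern can be sketched like this; the function name is hypothetical and only the `<run_dir>/.wandb_run_id` file location comes from the docs above.

```python
import uuid
from pathlib import Path


def get_or_create_wandb_run_id(run_dir: Path, resuming: bool) -> str:
    """Reuse the saved wandb run ID when resuming; otherwise mint a new one.

    Sketch of the continuity behavior described above: the ID is persisted
    to <run_dir>/.wandb_run_id so a resumed run reports to the same wandb run.
    """
    id_file = run_dir / ".wandb_run_id"
    if resuming and id_file.exists():
        return id_file.read_text().strip()
    run_id = uuid.uuid4().hex[:8]  # wandb-style short ID
    id_file.write_text(run_id)
    return run_id
```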

15 changes: 12 additions & 3 deletions src/leap_finetune/data_loaders/dataset_loader.py
@@ -1,6 +1,6 @@
from dataclasses import dataclass
from dataclasses import dataclass, field
from pathlib import Path
from typing import Literal
from typing import Callable, Literal

from datasets import Dataset, load_dataset

@@ -17,13 +17,18 @@ class DatasetLoader:

    dataset_path: str
    dataset_type: Literal["sft", "dpo", "vlm_sft"]
    model_name: str | None = None
    limit: int | None = None
    split: str = "train"
    test_size: float = 0.2
    subset: str | None = None
    # Prepended to relative image paths in VLM datasets (e.g. "/data/images")
    image_root: str | None = None
    cache_dataset: bool = False
    # Optional preprocessing function: takes Ray Dataset, returns Ray Dataset
    # Applied before validation - use for custom filtering, transforms, joins, etc.
    preprocess_fn: Callable | None = field(default=None, repr=False)
> **Contributor:** Are we using this anywhere?
>
> **Author:** Agents sometimes use it, but it's relatively harmless alone.
    _validated: bool = field(default=False, repr=False)

    def __post_init__(self):
        if not (0 < self.test_size < 1):
@@ -32,15 +37,19 @@ def __post_init__(self):
            )

    def quick_validate(self) -> None:
        """Fast validation on ~10 samples. Raises ValueError on issues."""
        """Fast validation on ~10 samples. Raises ValueError on issues. No-ops if already called."""
        if self._validated:
            return
        quick_validate_schema(
            dataset_path=self.dataset_path,
            dataset_type=self.dataset_type,
            subset=self.subset,
            split=self.split,
            num_samples=10,
            image_root=self.image_root,
            model_name=self.model_name,
        )
        self._validated = True

    def to_ray_dataset(self):
        """Create a lazy Ray Dataset from the source."""
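The new `_validated` guard makes validation idempotent, so upstream callers can invoke it defensively. A self-contained sketch of the pattern (`LoaderSketch` and `validation_runs` are illustrative names, not part of the repo):

```python
from dataclasses import dataclass, field


@dataclass
class LoaderSketch:
    dataset_path: str
    _validated: bool = field(default=False, repr=False)
    validation_runs: int = 0  # counts real validation passes, for illustration

    def quick_validate(self) -> None:
        """No-ops on repeat calls, mirroring the diff's guard."""
        if self._validated:
            return
        self.validation_runs += 1  # stands in for quick_validate_schema(...)
        self._validated = True
```

`repr=False` keeps the internal flag out of the dataclass repr, so logs show only user-facing configuration.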
9 changes: 9 additions & 0 deletions src/leap_finetune/data_loaders/ray_data_utils.py
@@ -10,6 +10,7 @@

from .dataset_loader import DatasetLoader
from .tokenize_data import tokenize_and_pack_sft, tokenize_dpo_dataset
from .tool_call_utils import get_tool_normalizer
from .validate_loader import get_row_filter, normalize_columns

logger = logging.getLogger(__name__)
@@ -138,6 +139,14 @@ def create_ray_datasets(
    normalizer = normalize_columns(loader.dataset_type, image_root=loader.image_root)
    ds = ds.map(normalizer)

    # Normalize tool call format (strip wrong markers, convert structured tool_calls)
    if loader.model_name and loader.dataset_type in ("sft", "dpo"):
        from leap_finetune.utils.model_utils import get_model_family

        model_family = get_model_family(loader.model_name)
        tool_normalizer = get_tool_normalizer(model_family)
        ds = ds.map(tool_normalizer)

    # Filter invalid rows using Ray's native filter (pure Python, Ray handles Arrow)
    if loader.dataset_type == "vlm_sft":
        console.print(
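`get_tool_normalizer` returns a per-row function suitable for `ds.map(...)`, dispatched on model family. A self-contained sketch under stated assumptions: the function and family names here are illustrative, and the real marker-handling rules live in `tool_call_utils.py`.

```python
from typing import Callable


def get_tool_normalizer_sketch(model_family: str) -> Callable[[dict], dict]:
    """Sketch: pick a row-level normalizer based on model family."""

    def strip_tool_list_markers(row: dict) -> dict:
        # LFM2.5 expects bare tool definitions: drop LFM2's list markers
        for msg in row.get("messages", []):
            if msg.get("role") == "system":
                msg["content"] = (
                    msg["content"]
                    .replace("<|tool_list_start|>", "")
                    .replace("<|tool_list_end|>", "")
                )
        return row

    def identity(row: dict) -> dict:
        return row

    return strip_tool_list_markers if model_family == "lfm2.5" else identity
```

Because `ds.map` applies the returned function lazily per row, the normalization cost is folded into Ray's existing streaming pass over the dataset.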