Commit 9e5dc94 (parent: 93552a6)

docs: fix docs tutorials and warn in training func for mismatched pipe names

3 files changed: +44, -28 lines

docs/tutorials/training-ner.md (11 additions, 8 deletions)
```diff
@@ -233,7 +233,7 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
 import edsnlp
 from edsnlp.training import train, ScheduledOptimizer, TrainingData
 from edsnlp.metrics.ner import NerExactMetric
-from edsnlp.training.loggers import CSVLogger, RichLogger, WandbLogger
+from edsnlp.training.loggers import CSVLogger, RichLogger, WandBLogger
 import edsnlp.pipes as eds
 import torch
 
@@ -242,6 +242,7 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
 nlp.add_pipe(
     # The NER pipe will be a CRF model
     eds.ner_crf(
+        name="ner",
         mode="joint",
         target_span_getter="gold_spans",
         # Set spans both as ents and in separate `ent.label` groups
@@ -280,19 +281,21 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
     optim=torch.optim.Adam,
     module=nlp,
     total_steps=max_steps,
-    groups={
-        "^transformer": {
-            "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 0 "max_value": 5e-5,},
+    groups=[
+        {
+            "selector": "transformer",
+            "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 0, "max_value": 5e-5,},
         },
-        "": {
-            "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 3e-4 "max_value": 3e-4,},
+        {
+            "selector": ".*",
+            "lr": {"@schedules": "linear", "warmup_rate": 0.1, "start_value": 3e-4, "max_value": 3e-4,},
         },
-    },
+    ],
 )
 
 #
 loggers = [
-    CSVLogger(),
+    CSVLogger.draft(),  # draft as we will let the train function specify the logging_dir
     RichLogger(
         fields={
             "step": {},
```

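The `groups` argument changes from a dict keyed by regex patterns to a list of dicts with an explicit `"selector"` key, which makes the matching order part of the data. Below is a minimal sketch of the implied first-match-wins rule; the parameter names are made up, and the rule itself is inferred from the fact that the catch-all `".*"` group comes last, so this is not edsnlp's exact implementation:

```python
import re

# Hypothetical parameter names, like those yielded by module.named_parameters()
param_names = [
    "ner.embedding.transformer.encoder.layer.0.weight",
    "ner.classifier.linear.weight",
]

groups = [
    {"selector": "transformer", "lr": 5e-5},  # specific group first
    {"selector": ".*", "lr": 3e-4},           # catch-all last
]

for name in param_names:
    # Assumed first-match-wins: otherwise the trailing ".*" selector
    # would also capture the transformer parameters.
    group = next(g for g in groups if re.search(g["selector"], name))
    print(f"{name} -> lr={group['lr']}")
```
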
docs/tutorials/training-span-classifier.md (6 additions, 4 deletions)
```diff
@@ -265,24 +265,26 @@ Visit the [`edsnlp.train` documentation][edsnlp.training.trainer.train] for a li
 # 🎛️ OPTIMIZER (here it will be the same as the default one)
 optimizer = ScheduledOptimizer.draft(  # (2)!
     optim=torch.optim.AdamW,
-    groups={
-        "biopsy_classifier[.]embedding": {
+    groups=[
+        {
+            "selector": "biopsy_classifier[.]embedding",
             "lr": {
                 "@schedules": "linear",
                 "warmup_rate": 0.1,
                 "start_value": 0.,
                 "max_value": 5e-5,
             },
         },
-        ".*": {
+        {
+            "selector": ".*",
             "lr": {
                 "@schedules": "linear",
                 "warmup_rate": 0.1,
                 "start_value": 3e-4,
                 "max_value": 3e-4,
             },
         },
-    }
+    ]
 )
 
 # 🚀 TRAIN
```

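Note that the optimizer here is built with `ScheduledOptimizer.draft(...)` rather than `ScheduledOptimizer(...)`. Judging by the `CSVLogger.draft()` comment in the first file, `.draft()` appears to create a partially-specified object whose missing arguments (presumably the module to optimize and `total_steps`, which the NER tutorial passes explicitly) are filled in later by `train`. A hedged sketch of that pattern, with the deferred-argument behavior treated as an assumption:

```python
import torch
from edsnlp.training import ScheduledOptimizer

# Draft: required arguments omitted here are assumed to be supplied by
# `train` when it completes the draft before the first optimization step.
optimizer = ScheduledOptimizer.draft(
    optim=torch.optim.AdamW,
    groups=[
        {
            "selector": "biopsy_classifier[.]embedding",
            "lr": {"@schedules": "linear", "warmup_rate": 0.1,
                   "start_value": 0.0, "max_value": 5e-5},
        },
        {
            "selector": ".*",
            "lr": {"@schedules": "linear", "warmup_rate": 0.1,
                   "start_value": 3e-4, "max_value": 3e-4},
        },
    ],
)
```
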
edsnlp/training/trainer.py (27 additions, 16 deletions)
```diff
@@ -676,6 +676,14 @@ def train(
         total_steps=max_steps,
     )
 
+    for td in train_data:
+        if not (td.pipe_names is None or td.pipe_names <= trainable_pipe_names):
+            raise ValueError(
+                f"Training data pipe names {td.pipe_names} should be a subset of "
+                f"the trainable pipe names {trainable_pipe_names}, or left to None "
+                f"to use this dataset for all trainable components."
+            )
+
     for phase_i, pipe_names in enumerate(phases):
         trained_pipes_local: Dict[str, TorchComponent] = {
             n: nlp.get_pipe(n) for n in pipe_names
@@ -688,6 +696,14 @@ def train(
             if td.pipe_names is None or set(td.pipe_names) & set(pipe_names)
         ]
 
+        if len(phase_training_data) == 0:
+            raise ValueError(
+                f"No training data found for phase {phase_i + 1} with components "
+                f"{', '.join(pipe_names)}. Make sure that these components are "
+                f"listed in the 'pipe_names' attribute of at least one of the "
+                f"provided training data."
+            )
+
         with nlp.select_pipes(disable=trainable_pipe_names - set(pipe_names)):
             accelerator.print(f"Phase {phase_i + 1}: training {', '.join(pipe_names)}")
             set_seed(seed)
@@ -700,37 +716,32 @@ def train(
                     grad_params.add(param)
                 param.requires_grad_(has_grad_param)
 
-            accelerator.print(
-                "Optimizing groups:"
-                + "".join(
-                    "\n - {} weight tensors ({:,} parameters){}".format(
+            accelerator.print("Optimizing groups:")
+            for g in optim.param_groups:
+                accelerator.print(
+                    " - {} weight tensors ({:,} parameters){}".format(
                         len([p for p in g["params"] if p in grad_params]),
                         sum([p.numel() for p in g["params"] if p in grad_params]),
                         ": " + " & ".join(g.get("selectors", "*"))
                         if "selectors" in g
                         else "",
                     )
-                    for g in optim.param_groups
                 )
-            )
             accelerator.print(
                 f"Keeping frozen {len(all_params - grad_params):} weight tensors "
                 f"({sum(p.numel() for p in all_params - grad_params):,} parameters)"
             )
 
             nlp.train(True)
 
-            iterator = iter(
-                zip(
-                    *(
-                        td(nlp, device).set_processing(
-                            num_cpu_workers=num_workers,
-                            process_start_method="spawn",
-                        )
-                        for td in phase_training_data
-                    )
+            phase_datasets = [
+                td(nlp, device).set_processing(
+                    num_cpu_workers=num_workers,
+                    process_start_method="spawn",
                 )
-            )
+                for td in phase_training_data
+            ]
+            iterator = iter(zip(*(phase_datasets)))
             (accel_optim, trained_pipes) = accelerator.prepare(optim, trained_pipes)
             if hasattr(accel_optim.optimizer, "initialize"):
                 accel_optim.optimizer.initialize()
```

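The first added block validates `pipe_names` eagerly, before any training phase starts, using set inclusion (`<=`) against the trainable pipe names. A standalone illustration of the rule follows; the pipe names here are hypothetical:

```python
trainable_pipe_names = {"ner", "qualifier"}

def check(pipe_names):
    # Mirrors the new guard: None means "use this dataset for all
    # trainable components"; otherwise the names must be a subset
    # of the trainable pipe names.
    if not (pipe_names is None or pipe_names <= trainable_pipe_names):
        raise ValueError(
            f"Training data pipe names {pipe_names} should be a subset "
            f"of the trainable pipe names {trainable_pipe_names}"
        )

check(None)             # ok: dataset feeds every trainable component
check({"ner"})          # ok: proper subset
check({"ner", "typo"})  # ValueError: "typo" is not a trainable pipe
```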