Skip to content

Commit 00c64c0

Browse files
authored
fix(input cutout): CutoutInput improvements and upgrades (#355)
A couple improvements to the cutout input: - Remove `pre_processors` from top level cutout - Add `_input` to cutout states - Alter `CutoutInput` to take a list to provide an explicit ordering Closes #356
1 parent 76b57b9 commit 00c64c0

File tree

7 files changed

+54
-72
lines changed

7 files changed

+54
-72
lines changed

docs/inference/configs/inputs.rst

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ consistently with what is done in ``anemoi-datasets``, see `here
150150

151151
The ``cutout`` input nests the ``private_attributes`` of the sources'
152152
states so may prevent usage of some keys. To restore these, use the
153-
``extract_source`` postprocessor.
153+
``extract_from_state`` postprocessor.
154154

155155
To extract regions from different sources within the ``cutout`` input,
156156
your checkpoint must contain the cutout masks as supporting arrays. You
@@ -175,11 +175,13 @@ running ``anemoi-inference patch <your_checkpoint>``.
175175
176176
input:
177177
cutout:
178-
lam_0:
178+
- lam_0:
179179
mars: {}
180180
mask: null
181181
182-
An example configuration for the ``cutout`` input is shown below:
182+
An example configuration for the ``cutout`` input is shown below. The
183+
sources can be provided as a list of positional arguments, with each
184+
source specified as a mapping from source name to source configuration:
183185

184186
.. literalinclude:: yaml/inputs_11.yaml
185187
:language: yaml
@@ -188,11 +190,11 @@ The different sources are specified exactly as you would for a single
188190
source, as shown in the previous sections.
189191

190192
An easy way to then extract regions from the predicted state is to use
191-
the ``extract_source`` postprocessor, which will subset the state to the
192-
specified source. For example, to extract the ``lam_0`` source from the
193-
state, you can use the following configuration:
193+
the ``extract_from_state`` postprocessor, which will subset the state to
194+
the specified source. For example, to extract the ``lam_0`` source from
195+
the state, you can use the following configuration:
194196

195197
.. code:: yaml
196198
197199
post_processors:
198-
- extract_source: 'lam_0'
200+
- extract_from_state: 'lam_0'
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
input:
22
cutout:
3-
lam_0:
4-
grib: <path_to_your_grib_file>
5-
global:
6-
grib: <path_to_your_grib_file>
3+
- lam_0:
4+
grib: <path_to_your_grib_file>
5+
- global:
6+
grib: <path_to_your_grib_file>

src/anemoi/inference/inputs/cutout.py

Lines changed: 24 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -11,31 +11,18 @@
1111
import logging
1212
from collections import defaultdict
1313
from collections.abc import Iterable
14-
from collections.abc import Mapping
1514

1615
import numpy as np
1716

1817
from anemoi.inference.input import Input
1918
from anemoi.inference.inputs import create_input
2019
from anemoi.inference.inputs import input_registry
2120
from anemoi.inference.types import Date
22-
from anemoi.inference.types import ProcessorConfig
2321
from anemoi.inference.types import State
2422

2523
LOG = logging.getLogger(__name__)
2624

2725

28-
def contains_key(obj, key: str) -> bool:
29-
"""Recursively check if `key` exists anywhere in a nested config (dict/DotDict/lists)."""
30-
if isinstance(obj, Mapping):
31-
if key in obj:
32-
return True
33-
return any(contains_key(v, key) for v in obj.values())
34-
if isinstance(obj, (list, tuple, set)):
35-
return any(contains_key(v, key) for v in obj)
36-
return False
37-
38-
3926
def _mask_and_combine_states(
4027
existing_state: State,
4128
new_state: State,
@@ -112,58 +99,48 @@ def _extract_and_add_private_attributes(
11299
class Cutout(Input):
113100
"""Combines one or more LAMs into a global source using cutouts."""
114101

115-
# TODO: Does this need an ordering?
116-
117102
def __init__(
118103
self,
119104
context,
120-
*,
121-
variables: list[str] | None = None,
122-
pre_processors: list[ProcessorConfig] | None = None,
123-
purpose: str | None = None,
124-
**sources: dict[str, dict],
105+
*args: dict[str, dict],
106+
sources: list[dict[str, dict]] | None = None,
107+
**kwargs,
125108
):
126109
"""Create a cutout input from a list of sources.
127110
128111
Parameters
129112
----------
130113
context : dict
131114
The context runner.
132-
sources : dict of sources
133-
A dictionary of sources to combine.
134-
variables : list[str] | None
135-
List of variables to be handled by the input, or None for a sensible default variables.
136-
pre_processors : Optional[List[ProcessorConfig]], default None
137-
Pre-processors to apply to the input. Note that pre_processors are applied to each sub-input.
138-
purpose : Optional[str]
139-
The purpose of the input.
115+
sources : list[dict[str, dict]]
116+
List of sources / inputs to combine, the order defines the order in which they are combined.
140117
"""
118+
if any(x in kwargs for x in ["lam_0", "global"]): # Capture common update issues
119+
raise KeyError(
120+
"Cutout input has changed to set the sub-inputs as a list, if using the config, prefix each input with `-` to update."
121+
)
122+
123+
super().__init__(context, pre_processors=None, **kwargs)
141124

142-
super().__init__(context, variables=variables, pre_processors=pre_processors, purpose=purpose)
125+
if not sources:
126+
sources = []
127+
sources = [*args, *sources]
143128

144129
self.sources: dict[str, Input] = {}
145130
self.masks: dict[str, np.ndarray | slice] = {}
146131

147-
for src, cfg in sources.items():
132+
for inp in sources:
133+
if not isinstance(inp, dict) or len(inp) != 1:
134+
raise ValueError("Each source in cutout inputs must be a dict with a single key-value pair.")
135+
src, cfg = next(iter(inp.items()))
136+
148137
if isinstance(cfg, str):
149138
mask = f"{src}/cutout_mask"
150139
else:
151140
cfg = cfg.copy()
152141
mask = cfg.pop("mask", f"{src}/cutout_mask")
153142

154-
if contains_key(cfg, "pre_processors"):
155-
combined_pre_processors = (pre_processors or []).extend(cfg.get("pre_processors", []))
156-
self.sources[src] = create_input(
157-
context, cfg, variables=variables, pre_processors=combined_pre_processors, purpose=purpose
158-
)
159-
else:
160-
self.sources[src] = create_input(
161-
context,
162-
cfg,
163-
variables=variables,
164-
purpose=purpose,
165-
pre_processors=pre_processors,
166-
)
143+
self.sources[src] = create_input(context, cfg, variables=self.variables, purpose=self.purpose)
167144

168145
if isinstance(mask, str):
169146
self.masks[src] = self.sources[src].checkpoint.load_supporting_array(mask)
@@ -236,6 +213,8 @@ def create_input_state(self, *, date: Date | None, **kwargs) -> State:
236213
_private_attributes["_mask"] = _mask_private_attributes
237214

238215
combined_state.update(_private_attributes)
216+
combined_state["_input"] = self
217+
239218
return combined_state
240219

241220
def load_forcings_state(self, *, dates: list[Date], current_state: State) -> State:
@@ -264,4 +243,6 @@ def load_forcings_state(self, *, dates: list[Date], current_state: State) -> Sta
264243
combined_fields = _mask_and_combine_states(combined_fields, source_state, source_mask, source_state.keys())
265244

266245
current_state["fields"] |= combined_fields
246+
current_state["_input"] = self
247+
267248
return current_state

src/anemoi/inference/inputs/dataset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ def create_input_state(self, *, date: Date | None = None, **kwargs) -> State:
153153
if self.context.trace:
154154
self.context.trace.from_input(variable, self)
155155

156+
input_state["_input"] = self
157+
156158
return input_state
157159

158160
def load_forcings_state(self, *, dates: list[Date], current_state: State) -> State:

src/anemoi/inference/runners/default.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,7 @@ def _combine_states(self, *states: dict[str, Any]) -> dict[str, Any]:
448448
if not np.array_equal(combined[key], value):
449449
raise ValueError(
450450
f"Key '{key}' has different array values in the states: "
451-
f"{combined[key]} and {value}."
451+
f"{combined[key]} ({combined[key].shape}) and {value} ({value.shape})."
452452
f" Input: {first_input} vs {this_input}."
453453
)
454454
continue

tests/integration/meteoswiss-sgm-cosmo/config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
write_initial_state: false
2424
input:
2525
cutout:
26-
lam_0:
26+
- lam_0:
2727
grib:
2828
path: ${input:0}
2929
namer:
@@ -60,7 +60,7 @@
6060
- lsm
6161
- - shortName: TOT_PREC
6262
- tp
63-
global:
63+
- global:
6464
grib: ${input:1}
6565
output:
6666
grib:

tests/unit/inputs/test_cutout.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,11 @@ def runner() -> None:
6767
def test_cutout_no_mask(runner: Runner):
6868
from anemoi.inference.inputs.cutout import Cutout
6969

70-
cutout_config = {
71-
"lam": {"mask": None, "dummy": {}},
72-
"global": {"mask": None, "dummy": {}},
73-
}
74-
cutout_input = Cutout(runner, variables=["2t"], **cutout_config)
70+
cutout_config = [
71+
{"lam": {"mask": None, "dummy": {}}},
72+
{"global": {"mask": None, "dummy": {}}},
73+
]
74+
cutout_input = Cutout(runner, variables=["2t"], sources=cutout_config)
7575
input_state = cutout_input.create_input_state(date=datetime.datetime.fromisoformat("2020-01-01T00:00"))
7676
number_of_grid_points = runner.checkpoint.number_of_grid_points
7777

@@ -89,11 +89,11 @@ def test_cutout_no_mask(runner: Runner):
8989
def test_cutout_with_slice(runner: Runner):
9090
from anemoi.inference.inputs.cutout import Cutout
9191

92-
cutout_config = {
93-
"lam": {"mask": slice(0, 10), "dummy": {}},
94-
"global": {"mask": slice(10, 25), "dummy": {}},
95-
}
96-
cutout_input = Cutout(runner, variables=["2t"], **cutout_config)
92+
cutout_config = [
93+
{"lam": {"mask": slice(0, 10), "dummy": {}}},
94+
{"global": {"mask": slice(10, 25), "dummy": {}}},
95+
]
96+
cutout_input = Cutout(runner, variables=["2t"], sources=cutout_config)
9797
assert list(cutout_input.sources.keys()) == ["lam", "global"]
9898

9999
input_state = cutout_input.create_input_state(date=datetime.datetime.fromisoformat("2020-01-01T00:00"))
@@ -120,11 +120,8 @@ def test_cutout_with_array(runner: Runner):
120120
global_mask = np.zeros(number_of_grid_points, dtype=bool)
121121
global_mask[10:25] = True
122122

123-
cutout_config = {
124-
"lam": {"mask": lam_mask, "dummy": {}},
125-
"global": {"mask": global_mask, "dummy": {}},
126-
}
127-
cutout_input = Cutout(runner, variables=["2t"], **cutout_config)
123+
cutout_config = [{"lam": {"mask": lam_mask, "dummy": {}}}, {"global": {"mask": global_mask, "dummy": {}}}]
124+
cutout_input = Cutout(runner, variables=["2t"], sources=cutout_config)
128125
input_state = cutout_input.create_input_state(date=datetime.datetime.fromisoformat("2020-01-01T00:00"))
129126

130127
assert "_mask" in input_state

0 commit comments

Comments
 (0)