
Commit ecaf28e

tjhunter authored and clessig committed
[1144] Extra fixes (#1148)
* Fixed problem in inference
* more fixes
* fixes
* lint
* lint

---------

Co-authored-by: Christian Lessig <[email protected]>
1 parent 8862303 commit ecaf28e

File tree

5 files changed, +49 −39 lines changed

integration_tests/small1_test.py

Lines changed: 3 additions & 2 deletions
@@ -172,8 +172,9 @@ def assert_train_loss_below_threshold(run_id):
     )
     # Check that the loss does not explode in a single epoch
     # This is meant to be a quick test, not a convergence test
-    assert loss_metric < 1.25, (
-        f"'stream.ERA5.loss_mse.loss_avg' is {loss_metric}, expected to be below 0.25"
+    target = 1.5
+    assert loss_metric < target, (
+        f"'stream.ERA5.loss_mse.loss_avg' is {loss_metric}, expected to be below {target}"
     )
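The change also repairs a mismatch in the old assertion, whose message claimed 0.25 while the condition checked 1.25. Binding the threshold to a variable keeps the two in sync; a minimal standalone sketch of the pattern (loss value assumed for illustration):

```python
# Sketch: bind the threshold once so the condition and the failure
# message cannot drift apart.
loss_metric = 1.1  # hypothetical value read from the run's metrics
target = 1.5
assert loss_metric < target, (
    f"'stream.ERA5.loss_mse.loss_avg' is {loss_metric}, expected to be below {target}"
)
```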

packages/common/src/weathergen/common/config.py

Lines changed: 2 additions & 1 deletion
@@ -86,7 +86,8 @@ def load_model_config(run_id: str, epoch: int | None, model_path: str | None) ->
     path = Path(model_path)
     fname = path / run_id / _get_model_config_file_name(run_id, epoch)
     assert fname.exists(), (
-        "The fallback path to the model does not exist. Please provide a `model_path`."
+        "The fallback path to the model does not exist. Please provide a `model_path`.",
+        fname,
     )

     _logger.info(f"Loading config from specified run_id and epoch: {fname}")
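Passing a tuple as the assertion message makes Python print the tuple's repr on failure, which surfaces the offending path. An equivalent sketch (not the committed code) embeds the path in an f-string instead:

```python
# Hypothetical alternative with the same effect, assuming fname is a Path:
assert fname.exists(), (
    f"The fallback path to the model does not exist: {fname}. "
    "Please provide a `model_path`."
)
```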

packages/common/src/weathergen/common/io.py

Lines changed: 27 additions & 28 deletions
@@ -30,6 +30,11 @@
 _logger = logging.getLogger(__name__)


+def is_ndarray(obj: typing.Any) -> bool:
+    """Check if object is an ndarray (wraps the linter warning)."""
+    return isinstance(obj, (np.ndarray))  # noqa: TID251
+
+
 @dataclasses.dataclass
 class IOReaderData:
     """
@@ -58,10 +63,10 @@ def create(cls, other: typing.Any) -> "IOReaderData":

         other should be such an instance.
         """
-        coords = other.coords
-        geoinfos = other.geoinfos
-        data = other.data
-        datetimes = other.datetimes
+        coords = np.asarray(other.coords)
+        geoinfos = np.asarray(other.geoinfos)
+        data = np.asarray(other.data)
+        datetimes = np.asarray(other.datetimes)

         n_datapoints = len(data)
@@ -130,22 +135,22 @@ class OutputDataset:
     item_key: ItemKey

     # (datapoints, channels, ens)
-    data: zarr.Array  # wrong type => array like
+    data: zarr.Array | NDArray  # wrong type => array like

     # (datapoints,)
-    times: zarr.Array
+    times: zarr.Array | NDArray

     # (datapoints, 2)
-    coords: zarr.Array
+    coords: zarr.Array | NDArray

     # (datapoints, geoinfos) geoinfos are stream dependent => 0 for most gridded data
-    geoinfo: zarr.Array
+    geoinfo: zarr.Array | NDArray

     channels: list[str]
     geoinfo_channels: list[str]

     @functools.cached_property
-    def arrays(self) -> dict[str, zarr.Array]:
+    def arrays(self) -> dict[str, zarr.Array | NDArray]:
         """Iterate over the arrays and their names."""
         return {
             "data": self.data,
@@ -236,7 +241,8 @@ def write_zarr(self, item: OutputItem):
         """Write one output item to the zarr store."""
         group = self._get_group(item.key, create=True)
         for dataset in item.datasets:
-            self._write_dataset(group, dataset)
+            if dataset is not None:
+                self._write_dataset(group, dataset)

     def get_data(self, sample: int, stream: str, forecast_step: int) -> OutputItem:
         """Get datasets for the output item matching the arguments."""
@@ -285,6 +291,7 @@ def _write_arrays(self, dataset_group: zarr.Group, dataset: OutputDataset):
             self._create_dataset(dataset_group, array_name, array)

     def _create_dataset(self, group: zarr.Group, name: str, array: NDArray):
+        assert is_ndarray(array), f"Expected ndarray but got: {type(array)}"
         if array.size == 0:  # sometimes for geoinfo
             chunks = None
         else:
@@ -394,20 +401,10 @@ def extract(self, key: ItemKey) -> OutputItem:
             target_data = np.zeros((0, len(self.target_channels[stream_idx])), dtype=np.float32)
             preds_data = np.zeros((0, len(self.target_channels[stream_idx])), dtype=np.float32)
         else:
-            target_data = (
-                self.targets[offset_key.forecast_step][stream_idx][0][datapoints]
-                .cpu()
-                .detach()
-                .numpy()
-            )
-            preds_data = (
-                self.predictions[offset_key.forecast_step][stream_idx][0]
-                .transpose(1, 0)
-                .transpose(1, 2)[datapoints]
-                .cpu()
-                .detach()
-                .numpy()
-            )
+            target_data = self.targets[offset_key.forecast_step][stream_idx][0][datapoints]
+            preds_data = self.predictions[offset_key.forecast_step][stream_idx][0].transpose(
+                1, 2, 0
+            )[datapoints]

         data_coords = self._extract_coordinates(stream_idx, offset_key, datapoints)
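Since the stored targets and predictions are now numpy arrays, the two chained torch transposes collapse into a single numpy axes permutation. A quick sketch (shapes assumed for illustration) verifying the two forms agree:

```python
import numpy as np
import torch

x = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4)  # assumed (ens, points, channels)

old = x.transpose(1, 0).transpose(1, 2).numpy()  # old code: two pairwise swaps
new = np.asarray(x).transpose(1, 2, 0)           # new code: one permutation

assert old.shape == new.shape == (3, 4, 2)
assert np.array_equal(old, new)
```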

@@ -423,6 +420,8 @@ def extract(self, key: ItemKey) -> OutputItem:
         else:
             source_dataset = None

+        assert is_ndarray(target_data), f"Expected ndarray but got: {type(target_data)}"
+        assert is_ndarray(preds_data), f"Expected ndarray but got: {type(preds_data)}"
         return OutputItem(
             key=key,
             source=source_dataset,
@@ -501,10 +500,10 @@ def _extract_sources(self, sample, stream_idx, key):
         source_dataset = OutputDataset(
             "source",
             key,
-            source.data,
-            source.datetimes,
-            source.coords,
-            source.geoinfos,
+            np.asarray(source.data),
+            np.asarray(source.datetimes),
+            np.asarray(source.coords),
+            np.asarray(source.geoinfos),
             channels,
             geoinfo_channels,
         )

src/weathergen/datasets/multi_stream_data_sampler.py

Lines changed: 2 additions & 2 deletions
@@ -313,12 +313,12 @@ def reset(self):
         self.tokenizer.reset_rng(self.rng)

     ###################################################
-    def denormalize_source_channels(self, stream_id, data):
+    def denormalize_source_channels(self, stream_id, data) -> torch.Tensor:
         # TODO: with multiple ds per stream we need to distinguish these here
         return self.streams_datasets[stream_id][0].denormalize_source_channels(data)

     ###################################################
-    def denormalize_target_channels(self, stream_id, data):
+    def denormalize_target_channels(self, stream_id, data) -> torch.Tensor:
         # TODO: with multiple ds per stream we need to distinguish these here
         return self.streams_datasets[stream_id][0].denormalize_target_channels(data)

src/weathergen/train/trainer.py

Lines changed: 15 additions & 6 deletions
@@ -19,6 +19,7 @@
 import torch
 import torch.nn as nn
 import tqdm
+from numpy.typing import NDArray
 from omegaconf import OmegaConf
 from torch import Tensor

@@ -240,7 +241,7 @@ def init_model_and_shard(self, cf, devices):
         for tensor in itertools.chain(model.parameters(), model.buffers()):
             assert tensor.device == torch.device("meta")

-        # For reasons we do not yet fully understand, when using train continue in some 
+        # For reasons we do not yet fully understand, when using train continue in some
         # instances, FSDP2 does not register the forward_channels and forward_columns
         # functions in the embedding engine as forward functions. Thus, yielding a crash
         # because the input tensors are not converted to DTensors. This seems to primarily
@@ -518,9 +519,13 @@ def _prepare_logging(

         # assert len(targets_rt) == len(preds) and len(preds) == len(self.cf.streams)
         fsteps = len(targets_rt)
-        preds_all = [[[] for _ in self.cf.streams] for _ in range(fsteps)]
-        targets_all = [[[] for _ in self.cf.streams] for _ in range(fsteps)]
-        targets_lens = [[[] for _ in self.cf.streams] for _ in range(fsteps)]
+        preds_all: list[list[list[NDArray]]] = [
+            [[] for _ in self.cf.streams] for _ in range(fsteps)
+        ]
+        targets_all: list[list[list[NDArray]]] = [
+            [[] for _ in self.cf.streams] for _ in range(fsteps)
+        ]
+        targets_lens: list[list[list[int]]] = [[[] for _ in self.cf.streams] for _ in range(fsteps)]

         # TODO: iterate over batches here in future, and change loop order to batch, stream, fstep
         for fstep in range(len(targets_rt)):
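The annotations document that the innermost lists accumulate numpy arrays (per stream, per forecast step). A minimal sketch of the same annotation pattern, with stream and step counts assumed:

```python
import numpy as np
from numpy.typing import NDArray

n_streams, fsteps = 2, 3  # assumed counts for illustration
preds_all: list[list[list[NDArray]]] = [
    [[] for _ in range(n_streams)] for _ in range(fsteps)
]
preds_all[0][1].append(np.zeros((4, 8), dtype=np.float32))
assert isinstance(preds_all[0][1][0], np.ndarray)
```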
@@ -542,8 +547,12 @@ def _prepare_logging(
             dn_data = self.dataset_val.denormalize_target_channels

             f32 = torch.float32
-            preds_all[fstep][i_strm] += [dn_data(i_strm, pred.to(f32)).detach().cpu()]
-            targets_all[fstep][i_strm] += [dn_data(i_strm, target.to(f32)).detach().cpu()]
+            preds_all[fstep][i_strm] += [
+                np.asarray(dn_data(i_strm, pred.to(f32)).detach().cpu())
+            ]
+            targets_all[fstep][i_strm] += [
+                np.asarray(dn_data(i_strm, target.to(f32)).detach().cpu())
+            ]

         return (
             preds_all,
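With the accumulators typed as NDArray, tensors are converted to numpy at the point of collection rather than downstream. A minimal sketch of the conversion chain (random tensor assumed):

```python
import numpy as np
import torch

t = torch.randn(4, 3, requires_grad=True)
arr = np.asarray(t.detach().cpu())  # detach from autograd, move to CPU, view as numpy
assert isinstance(arr, np.ndarray) and arr.shape == (4, 3)
```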
