Skip to content

Commit ff2aab0

Browse files
committed
feature importance
1 parent f6f6e87 commit ff2aab0

File tree

9 files changed

+815
-482
lines changed

9 files changed

+815
-482
lines changed

docs/pre_executed/demo.ipynb

Lines changed: 734 additions & 252 deletions
Large diffs are not rendered by default.

src/uncle_val/datasets/dp1.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77
import pandas as pd
88
import pyarrow as pa
9+
from nested_pandas import NestedFrame
910
from upath import UPath
1011

1112
from uncle_val.variability_detectors import get_combined_variability_detector
@@ -83,10 +84,10 @@ def _split_light_curves_by_band(
8384
single_band["band"] = band
8485

8586
single_band["object_mag"] = single_band[f"{band}_psfMag"]
86-
single_band = single_band.drop(columns=[f"{band}_psfMag" for band in LSDB_BANDS])
87+
single_band = single_band.drop(columns=[f"{b}_psfMag" for b in bands])
8788

8889
single_band["extendedness"] = single_band[f"{band}_extendedness"]
89-
single_band = single_band.drop(columns=[f"{band}_extendedness" for band in LSDB_BANDS])
90+
single_band = single_band.drop(columns=[f"{b}_extendedness" for b in bands])
9091

9192
single_band_dfs.append(single_band)
9293

@@ -357,6 +358,7 @@ def dp1_catalog_multi_band(
357358
phot: Literal["PSF"],
358359
mode: Literal["forced"],
359360
variability_detectors: Sequence[Callable] | Literal["all"] = "all",
361+
pre_filter_partition: Callable[[NestedFrame], NestedFrame] | None = None,
360362
):
361363
"""Rubin DP1 LSDB catalog, bands are one-hot encoded.
362364
@@ -397,6 +399,10 @@ def dp1_catalog_multi_band(
397399
Which variability detectors to pass to
398400
`get_combined_variability_detector()`, default passing `None` which means
399401
using all of them.
402+
pre_filter_partition : callable or None
403+
Optional function applied to each catalog partition before any other
404+
processing. Receives a ``NestedFrame`` and returns a filtered
405+
``NestedFrame``.
400406
401407
Returns
402408
-------
@@ -421,6 +427,9 @@ def dp1_catalog_multi_band(
421427
read_visit_cols=True,
422428
)
423429

430+
if pre_filter_partition is not None:
431+
catalog = catalog.map_partitions(pre_filter_partition)
432+
424433
if variability_detectors == "all":
425434
variability_detectors = None
426435
var_detector = get_combined_variability_detector(variability_detectors)

src/uncle_val/learning/lsdb_dataset.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,23 @@ def _reduce_all_columns_wrapper(*args, columns=None, udf, **kwargs):
2222

2323

2424
def _process_lc(
25-
row: dict[str, object], *, n_src: int, lc_col: str, length_col: str, rng: np.random.Generator
25+
row: dict[str, object],
26+
*,
27+
n_src: int,
28+
subsample_src: bool,
29+
lc_col: str,
30+
length_col: str,
31+
rng: np.random.Generator,
2632
) -> dict[str, np.ndarray]:
2733
lc_length = row.pop(length_col)
28-
idx = rng.choice(lc_length, size=n_src, replace=False)
34+
idx = rng.choice(lc_length, size=n_src, replace=False) if subsample_src else np.arange(lc_length)
2935

3036
result: dict[str, np.ndarray] = {}
3137
for col, value in row.items():
3238
if col.startswith(f"{lc_col}."):
3339
result[col] = value[idx]
3440
else:
35-
result[f"{lc_col}.{col}"] = np.full(n_src, value)
41+
result[f"{lc_col}.{col}"] = np.full(len(idx), value)
3642
return result
3743

3844

@@ -41,6 +47,7 @@ def _process_partition(
4147
pixel: HealpixPixel,
4248
*,
4349
n_src: int,
50+
subsample_src: bool,
4451
lc_col: str,
4552
id_col: str,
4653
hash_range: tuple[int, int] | None,
@@ -77,6 +84,7 @@ def _process_partition(
7784
columns=columns,
7885
udf=_process_lc,
7986
n_src=n_src,
87+
subsample_src=subsample_src,
8088
lc_col=lc_col,
8189
length_col=length_col,
8290
rng=rng,
@@ -92,6 +100,7 @@ def lsdb_nested_series_data_generator(
92100
id_col: str = "id",
93101
client: dask.distributed.Client | None,
94102
n_src: int,
103+
subsample_src: bool = True,
95104
partitions_per_chunk: int | None,
96105
hash_range: tuple[int, int] | None = None,
97106
loop: bool = False,
@@ -101,8 +110,10 @@ def lsdb_nested_series_data_generator(
101110
102111
The data is pre-fetched in the background, 'n_workers' number
103112
of partitions at a time (derived from `client` object).
104-
It filters out light curves with less than `n_src` observations,
105-
and selects `n_src` random observations per light curve.
113+
Filters out light curves with fewer than `n_src` observations.
114+
If `subsample_src` is ``True``, selects exactly `n_src` random observations
115+
per light curve. If ``False``, all observations from qualifying light curves
116+
are included.
106117
107118
Parameters
108119
----------
@@ -118,7 +129,12 @@ def lsdb_nested_series_data_generator(
118129
value. If Dask client is given, the data would be fetched on the
119130
background.
120131
n_src : int
121-
Number of random observations per light curve.
132+
Minimum number of observations required per light curve. Also the
133+
subsample target when `subsample_src` is ``True``.
134+
subsample_src : bool, optional
135+
If ``True`` (default), randomly subsample exactly `n_src` observations
136+
per light curve. If ``False``, include all observations from qualifying
137+
light curves.
122138
partitions_per_chunk : int
123139
Number of `catalog` partitions to load in memory simultaneously.
124140
This changes the randomness.
@@ -151,6 +167,7 @@ def lsdb_nested_series_data_generator(
151167
_process_partition,
152168
include_pixel=True,
153169
n_src=n_src,
170+
subsample_src=subsample_src,
154171
lc_col=lc_col,
155172
id_col=id_col,
156173
hash_range=hash_range,
@@ -205,7 +222,8 @@ class LSDBIterableDataset(IterableDataset):
205222
Number of batches to yield. If `splits` is used, it will be the size
206223
of the first subset.
207224
n_src : int
208-
Number of random observations per light curve.
225+
Number of random observations per light curve. Light curves with fewer
226+
than `n_src` observations are filtered out.
209227
partitions_per_chunk : int or None
210228
Number of `catalog` partitions per time, if None it is derived
211229
from the number of dask workers associated with `Client` (one if
Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
from .dp1_constant_magerr import run_dp1_constant_magerr
2+
from .dp1_feature_importance import run_dp1_feature_importance
23
from .dp1_linear_flux_err import run_dp1_linear_flux_err
34
from .dp1_mlp import run_dp1_mlp
4-
from .plotting import make_plots, plot_shap_summary
5-
from .validation_set_utils import compute_shap_values
5+
from .plotting import make_plots
66

77
__all__ = (
8-
"compute_shap_values",
98
"make_plots",
10-
"plot_shap_summary",
119
"run_dp1_constant_magerr",
10+
"run_dp1_feature_importance",
1211
"run_dp1_linear_flux_err",
1312
"run_dp1_mlp",
1413
)

src/uncle_val/pipelines/dp1_mlp.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
from collections.abc import Callable, Sequence
12
from pathlib import Path
23

34
import torch
5+
from nested_pandas import NestedFrame
46

57
from uncle_val.datasets.dp1 import dp1_catalog_multi_band
68
from uncle_val.learning.losses import UncleLoss
@@ -26,6 +28,8 @@ def run_dp1_mlp(
2628
log_activations: bool = False,
2729
snapshot_every: int = 128,
2830
device: torch.device | str = "cpu",
31+
bands: Sequence[str] = "ugrizy",
32+
pre_filter_partition: Callable[[NestedFrame], NestedFrame] | None = None,
2933
) -> tuple[Path, list[str]]:
3034
"""Run the training for DP1 with the linear model on fluxes and errors
3135
@@ -68,6 +72,12 @@ def run_dp1_mlp(
6872
Whether to log validation activations with TensorBoard session.
6973
device : torch.device | str
7074
Torch device to use for training.
75+
bands : sequence of str
76+
Bands to include, subset of ``ugrizy``. Defaults to all six bands.
77+
pre_filter_partition : callable or None
78+
Optional function applied to each catalog partition before any other
79+
processing. Receives a ``NestedFrame`` and returns a filtered
80+
``NestedFrame``.
7181
7282
Returns
7383
-------
@@ -76,15 +86,14 @@ def run_dp1_mlp(
7686
list[str]
7787
List of columns to use as model inputs.
7888
"""
79-
bands = "ugrizy"
80-
8189
catalog = dp1_catalog_multi_band(
8290
root=dp1_root,
8391
bands=bands,
8492
obj="science",
8593
img="cal",
8694
phot="PSF",
8795
mode="forced",
96+
pre_filter_partition=pre_filter_partition,
8897
).map_partitions(lambda df: df.drop(columns=["band", "object_mag", "coord_ra", "coord_dec"]))
8998

9099
columns = [

src/uncle_val/pipelines/plotting.py

Lines changed: 0 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -302,56 +302,6 @@ def _plot_magn_vs_uu(
302302
ax.legend()
303303

304304

305-
def plot_shap_summary(
306-
shap_values: np.ndarray,
307-
feature_data: np.ndarray,
308-
input_names: list[str],
309-
*,
310-
output_path: str | Path | None = None,
311-
title: str = "SHAP Feature Importance",
312-
) -> plt.Figure:
313-
"""Plot a SHAP beeswarm summary for the predicted uncertainty factor ``u``.
314-
315-
Each dot represents one observation, positioned by its SHAP value (impact
316-
on ``u``) and coloured by the raw feature value (red = high, blue = low).
317-
318-
Parameters
319-
----------
320-
shap_values : np.ndarray, shape ``(n_samples, n_features)``
321-
SHAP values as returned by
322-
:func:`~uncle_val.pipelines.validation_set_utils.compute_shap_values`.
323-
feature_data : np.ndarray, shape ``(n_samples, n_features)``
324-
Raw feature values corresponding to *shap_values*.
325-
input_names : list of str
326-
Feature names in the order of the last dimension.
327-
output_path : str, Path, or None
328-
If given, save the figure to this path.
329-
title : str
330-
Figure title.
331-
332-
Returns
333-
-------
334-
matplotlib.figure.Figure
335-
The created figure.
336-
"""
337-
import shap
338-
339-
plt.close("all")
340-
explanation = shap.Explanation(
341-
values=shap_values,
342-
data=feature_data,
343-
feature_names=input_names,
344-
)
345-
shap.plots.beeswarm(explanation, show=False, max_display=len(input_names))
346-
fig = plt.gcf()
347-
fig.suptitle(title, y=1.01)
348-
349-
if output_path is not None:
350-
fig.savefig(output_path, bbox_inches="tight")
351-
352-
return fig
353-
354-
355305
def make_plots(
356306
dp1_root: str | Path,
357307
*,

src/uncle_val/pipelines/splits.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
TRAIN_SPLIT = 0.0, 0.75
2-
VALIDATION_SPLIT = 0.75, 0.85
1+
TRAIN_SPLIT = 0.0, 0.6
2+
VALIDATION_SPLIT = 0.6, 0.85
33
TEST_SPLIT = 0.85, 1.0

src/uncle_val/pipelines/training_loop.py

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,15 @@
1212
from torch.utils.tensorboard import SummaryWriter
1313
from tqdm.auto import tqdm
1414

15+
from uncle_val.datasets.materialized import MaterializedDataLoaderContext
16+
from uncle_val.learning.feature_importance import compute_shap_values, plot_shap_summary
1517
from uncle_val.learning.losses import UncleLoss
1618
from uncle_val.learning.lsdb_dataset import LSDBIterableDataset
1719
from uncle_val.learning.models import BaseUncleModel
1820
from uncle_val.learning.training import train_step
19-
from uncle_val.pipelines.plotting import plot_shap_summary
20-
from uncle_val.pipelines.splits import TRAIN_SPLIT, VALIDATION_SPLIT
21+
from uncle_val.pipelines.splits import TEST_SPLIT, TRAIN_SPLIT, VALIDATION_SPLIT
2122
from uncle_val.pipelines.utils import _launch_tfboard
22-
from uncle_val.pipelines.validation_set_utils import (
23-
ValidationDataLoaderContext,
24-
compute_shap_values,
25-
get_val_stats,
26-
)
23+
from uncle_val.pipelines.validation_set_utils import get_val_stats
2724

2825

2926
def get_val_workers(client: Client, device: torch.device) -> list[object] | None:
@@ -147,7 +144,7 @@ def training_loop(
147144
device=device,
148145
)
149146

150-
with ValidationDataLoaderContext(validation_dataset_lsdb, tmp_validation_dir) as val_dataloader:
147+
with MaterializedDataLoaderContext(validation_dataset_lsdb, tmp_validation_dir) as val_dataloader:
151148
val_stats_future: Future | None = None
152149
mean_val_loss_i = 0
153150

@@ -268,19 +265,33 @@ def snapshot(i):
268265
snapshot(i_train_batch)
269266
snapshot(i_train_batch)
270267

271-
if best_model_path is not None and model.input_names:
268+
if best_model_path is not None and model.input_names:
269+
test_dataset_lsdb = LSDBIterableDataset(
270+
catalog=catalog,
271+
columns=columns,
272+
client=client,
273+
batch_lc=val_batch_size,
274+
n_src=n_src,
275+
partitions_per_chunk=n_workers * 8,
276+
loop=False,
277+
hash_range=TEST_SPLIT,
278+
seed=2,
279+
device=device,
280+
)
281+
tmp_test_dir = output_dir / "test_shap"
282+
with MaterializedDataLoaderContext(test_dataset_lsdb, tmp_test_dir) as test_dataloader:
272283
shap_values, feature_data = compute_shap_values(
273284
model_path=best_model_path,
274-
data_loader=val_dataloader,
285+
data_loader=test_dataloader,
275286
device=device,
276287
)
277-
fig = plot_shap_summary(
278-
shap_values,
279-
feature_data,
280-
input_names=model.input_names,
281-
output_path=output_dir / "feature_importance.png",
282-
)
283-
summary_writer.add_figure("Feature importance", fig)
288+
fig = plot_shap_summary(
289+
shap_values,
290+
feature_data,
291+
input_names=model.input_names,
292+
output_path=output_dir / "feature_importance.png",
293+
)
294+
summary_writer.add_figure("Feature importance", fig)
284295

285296
model.eval()
286297
summary_writer.add_graph(model, train_batch[0])

0 commit comments

Comments
 (0)