diff --git a/documentation/changelog.rst b/documentation/changelog.rst
index 900ebaabf3..1f7024f4ba 100644
--- a/documentation/changelog.rst
+++ b/documentation/changelog.rst
@@ -45,6 +45,7 @@ Infrastructure / Support
 
 Bugfixes
 -----------
+* Fix forecasting covariate assembly to retain the latest known value per regressor when regressors have different belief times for the same event [see `PR #2155 <https://www.github.com/FlexMeasures/flexmeasures/pull/2155>`_]
 * Fix ``StorageScheduler`` crash (``AttributeError: 'NoneType' object has no attribute 'event_resolution'``) when scheduling a site whose asset tree contains non-storage devices with only a ``power-capacity`` in their ``flex-model`` (no ``sensor`` key) [see `PR #2085 <https://www.github.com/FlexMeasures/flexmeasures/pull/2085>`_]
 * Fix DST transition handling by supporting both native Python ``datetime`` and pandas ``Timestamp`` objects in time series segment processing, preventing ``AttributeError`` when processing segments with differing UTC offsets [see `PR #2197 <https://www.github.com/FlexMeasures/flexmeasures/pull/2197>`_]
 * Fix forecasting regressor filtering to use only regressor beliefs known at the forecast ``belief_time`` [see `PR #2134 <https://www.github.com/FlexMeasures/flexmeasures/pull/2134>`_]
diff --git a/flexmeasures/data/models/forecasting/pipelines/base.py b/flexmeasures/data/models/forecasting/pipelines/base.py
index 120988d8ec..4de0287600 100644
--- a/flexmeasures/data/models/forecasting/pipelines/base.py
+++ b/flexmeasures/data/models/forecasting/pipelines/base.py
@@ -277,6 +277,59 @@ def _generate_splits(
             belief_timestamps_list : list[pd.Timestamp]
             """
 
+            def _select_latest_per_regressor(
+                data: pd.DataFrame,
+                regressor_columns: list[str],
+            ) -> pd.DataFrame:
+                """Select latest non-null values per event and regressor.
+
+                Each regressor is resolved on its own, because the latest belief
+                about one regressor may sit on a different row than the latest
+                belief about another regressor for the same event.
+
+                :param data: Frame with ``event_start``, ``belief_time``,
+                    and regressor columns.
+                :param regressor_columns: Regressor columns to select independently.
+                :return: Wide frame with one row per ``event_start``
+                    (sorted ascending) and one selected value per regressor,
+                    or NaN where a regressor has no non-null value for an event.
+                """
+                keep = ["event_start", *regressor_columns]
+                if data.empty:
+                    return data.iloc[0:0][keep].copy()
+                # ``idxmax`` returns index labels, so make the labels unique:
+                # with duplicated labels, ``.loc[idx]`` would return several
+                # rows per event and the merge below would duplicate events.
+                data = data.reset_index(drop=True)
+                selected = (
+                    data[["event_start"]]
+                    .drop_duplicates()
+                    .sort_values("event_start")
+                    .reset_index(drop=True)
+                )
+
+                for regressor in regressor_columns:
+                    regressor_data = data[
+                        ["event_start", "belief_time", regressor]
+                    ].dropna(subset=[regressor])
+                    if regressor_data.empty:
+                        selected[regressor] = np.nan
+                        continue
+
+                    # Row label of the most recent belief per event
+                    # (the first such row if belief times tie).
+                    idx = regressor_data.groupby("event_start")["belief_time"].idxmax()
+                    selected_values = regressor_data.loc[
+                        idx, ["event_start", regressor]
+                    ]
+                    selected = selected.merge(
+                        selected_values,
+                        on="event_start",
+                        how="left",
+                    )
+
+                return selected[keep]
+
             target_sensor_resolution = self.target_sensor.event_resolution
 
             # target_start is the timestamp of the event_start of the first event in realizations
@@ -347,27 +400,38 @@ def _slice_closed(
                 out.loc[:, "event_start"] = es.iloc[lo:hi].to_numpy()
                 return out
 
-            def _latest_known_by_event_start(
+            def _latest_known_per_regressor(
                 df_: pd.DataFrame,
+                regressor_columns: list[str],
                 forecast_belief_time: pd.Timestamp,
                 realized_only: bool = False,
             ) -> pd.DataFrame:
-                """Select one row per event using beliefs known at forecast belief time."""
-                keep = [c for c in df_.columns if c not in ("belief_time")]
+                """Select latest regressor values known at forecast belief time.
+
+                :param df_: Frame with ``event_start``, ``belief_time``,
+                    and regressor columns.
+                :param regressor_columns: Regressor columns to select independently.
+                :param forecast_belief_time: Beliefs recorded after this time
+                    are ignored.
+                :param realized_only: If True, keep only beliefs recorded after
+                    the start of their event; if False, keep only beliefs
+                    recorded at or before the start of their event.
+                :return: Wide frame with one row per ``event_start``
+                    and one selected value per regressor.
+                """
+                keep = ["event_start", *regressor_columns]
                 if df_.empty:
                     return df_.iloc[0:0][keep].copy()
 
                 known = df_.loc[df_["belief_time"] <= forecast_belief_time].copy()
                 if realized_only:
                     known = known.loc[known["belief_time"] > known["event_start"]]
+                else:
+                    known = known.loc[known["belief_time"] <= known["event_start"]]
                 if known.empty:
                     return df_.iloc[0:0][keep].copy()
 
-                idx = known.groupby("event_start")["belief_time"].idxmax()
-                latest = (
-                    known.loc[idx].sort_values("event_start").reset_index(drop=True)
-                )
-                return latest[keep]
+                return _select_latest_per_regressor(known, regressor_columns)
 
             target_list = []
             past_covariates_list = []
@@ -408,8 +472,11 @@ def _latest_known_by_event_start(
 
                 # Past covariates split
                 if X_past_regressors_df is not None:
-                    past_known = _latest_known_by_event_start(
-                        X_past_regressors_df, belief_time, realized_only=True
+                    past_known = _latest_known_per_regressor(
+                        X_past_regressors_df,
+                        self.past_regressors,
+                        belief_time,
+                        realized_only=True,
                     )
                     past_slice = _slice_closed(past_known, target_start, target_end)
                     past_covariates = self.detect_and_fill_missing_values(
@@ -424,8 +491,11 @@ def _latest_known_by_event_start(
 
                 # Future covariates (realized up to target_end + forecasts up to forecast_end) split
                 if X_future_regressors_df is not None:
-                    future_known = _latest_known_by_event_start(
-                        X_future_regressors_df, belief_time, realized_only=True
+                    future_known = _latest_known_per_regressor(
+                        X_future_regressors_df,
+                        self.future_regressors,
+                        belief_time,
+                        realized_only=True,
                     )
                     realized_slice = _slice_closed(
                         future_known, target_start, target_end
@@ -444,23 +514,14 @@ def _latest_known_by_event_start(
                         )
                     ].copy()
 
-                    keep_fc = [
-                        c for c in X_future_regressors_df.columns if c != "belief_time"
-                    ]
-                    if fc_window.empty:
-                        forecast_slice = fc_window.iloc[0:0][keep_fc].copy()
-                    else:
-                        # For each future event_start, pick the latest belief known
-                        # at the simulated forecast belief time.
-                        idx_fc = fc_window.groupby("event_start")[
-                            "belief_time"
-                        ].idxmax()
-                        forecast_slice = (
-                            fc_window.loc[idx_fc]
-                            .sort_values("event_start")
-                            .reset_index(drop=True)
-                        )
-                        forecast_slice = forecast_slice[keep_fc]
+                    # For each future event_start, pick the latest forecast belief known
+                    # at the simulated forecast belief time.
+                    forecast_slice = _latest_known_per_regressor(
+                        fc_window,
+                        self.future_regressors,
+                        belief_time,
+                        realized_only=False,
+                    )
 
                     future_df = (
                         pd.concat([realized_slice, forecast_slice], ignore_index=True)
diff --git a/flexmeasures/data/tests/test_forecasting_pipeline.py b/flexmeasures/data/tests/test_forecasting_pipeline.py
index a14a47274b..c1d00b7fba 100644
--- a/flexmeasures/data/tests/test_forecasting_pipeline.py
+++ b/flexmeasures/data/tests/test_forecasting_pipeline.py
@@ -665,6 +665,218 @@ def test_prior_restricts_training_beliefs(
     )
 
 
+def test_future_regressor_split_selects_latest_known_value_per_regressor(monkeypatch):
+    """Check that future covariates keep the latest known value per regressor.
+
+    The regressors' beliefs sit on separate rows with different belief times,
+    and beliefs recorded after the forecast belief time must be left out.
+    """
+    target_sensor = type(
+        "SensorStub",
+        (),
+        {"name": "target", "id": 1, "event_resolution": timedelta(hours=1)},
+    )()
+    future_regressor_a = type(
+        "SensorStub",
+        (),
+        {"name": "weather-a", "id": 2, "event_resolution": timedelta(hours=1)},
+    )()
+    future_regressor_b = type(
+        "SensorStub",
+        (),
+        {"name": "weather-b", "id": 3, "event_resolution": timedelta(hours=1)},
+    )()
+
+    pipeline = BasePipeline(
+        target_sensor=target_sensor,
+        future_regressors=[future_regressor_a, future_regressor_b],
+        past_regressors=[],
+        n_steps_to_predict=1,
+        max_forecast_horizon=1,
+        forecast_frequency=1,
+        event_starts_after=datetime(2025, 1, 8, 9),
+        event_ends_before=datetime(2025, 1, 8, 10),
+    )
+    forecast_belief_time = pd.Timestamp("2025-01-08T10:00:00")
+    regressor_a, regressor_b = pipeline.future_regressors
+
+    df = pd.DataFrame(
+        [
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": forecast_belief_time - pd.Timedelta(minutes=30),
+                pipeline.target: None,
+                regressor_a: 3.0,
+                regressor_b: None,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": forecast_belief_time,
+                pipeline.target: 1.0,
+                regressor_a: 4.0,
+                regressor_b: None,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T10:00:00"),
+                "belief_time": forecast_belief_time - pd.Timedelta(minutes=30),
+                pipeline.target: None,
+                regressor_a: 5.0,
+                regressor_b: None,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T10:00:00"),
+                "belief_time": forecast_belief_time - pd.Timedelta(minutes=15),
+                pipeline.target: None,
+                regressor_a: None,
+                regressor_b: 7.0,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T10:00:00"),
+                "belief_time": forecast_belief_time - pd.Timedelta(minutes=10),
+                pipeline.target: None,
+                regressor_a: None,
+                regressor_b: 8.0,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T10:00:00"),
+                "belief_time": forecast_belief_time + pd.Timedelta(minutes=5),
+                pipeline.target: None,
+                regressor_a: 50.0,
+                regressor_b: 80.0,
+            },
+        ]
+    )
+
+    captured_future_frames = []
+
+    # Capture the covariate frame before missing-value filling converts it
+    # to a Darts TimeSeries. This keeps the test focused on in-memory belief
+    # selection instead of requiring database-backed sensor data.
+    def capture_frame(self, df, sensors, sensor_names, start, end, **kwargs):
+        if sensor_names == self.future_regressors:
+            captured_future_frames.append(df.copy())
+        return df
+
+    monkeypatch.setattr(BasePipeline, "detect_and_fill_missing_values", capture_frame)
+
+    pipeline.split_data_all_beliefs(df)
+
+    assert len(captured_future_frames) == 1, (
+        "Expected one future-covariate frame because this one-step pipeline "
+        "prepares exactly one split."
+    )
+    selected = captured_future_frames[0].set_index("event_start")
+    assert selected.loc[pd.Timestamp("2025-01-08T09:00:00"), regressor_a] == 4.0, (
+        "Expected regressor A's latest known realized value for the historical "
+        "event, because it has the latest non-null belief by forecast time."
+    )
+    assert selected.loc[pd.Timestamp("2025-01-08T10:00:00"), regressor_a] == 5.0, (
+        "Expected regressor A's available forecast value to survive even though "
+        "regressor B has a later belief on a different joined row."
+    )
+    assert selected.loc[pd.Timestamp("2025-01-08T10:00:00"), regressor_b] == 8.0, (
+        "Expected regressor B's latest known forecast value, because selection "
+        "happens independently per regressor."
+    )
+    assert 50.0 not in set(selected[regressor_a].dropna()), (
+        "Expected regressor A's future belief recorded after the forecast "
+        "belief_time to be excluded."
+    )
+    assert 80.0 not in set(selected[regressor_b].dropna()), (
+        "Expected regressor B's future belief recorded after the forecast "
+        "belief_time to be excluded."
+    )
+
+
+def test_past_regressor_split_selects_latest_known_value_per_regressor(monkeypatch):
+    """Check that past covariates keep a value for each regressor.
+
+    The regressors' beliefs sit on separate rows with different belief times.
+    """
+    target_sensor = type(
+        "SensorStub",
+        (),
+        {"name": "target", "id": 1, "event_resolution": timedelta(hours=1)},
+    )()
+    past_regressor_a = type(
+        "SensorStub",
+        (),
+        {"name": "meter-a", "id": 2, "event_resolution": timedelta(hours=1)},
+    )()
+    past_regressor_b = type(
+        "SensorStub",
+        (),
+        {"name": "meter-b", "id": 3, "event_resolution": timedelta(hours=1)},
+    )()
+
+    pipeline = BasePipeline(
+        target_sensor=target_sensor,
+        future_regressors=[],
+        past_regressors=[past_regressor_a, past_regressor_b],
+        n_steps_to_predict=1,
+        max_forecast_horizon=1,
+        forecast_frequency=1,
+        event_starts_after=datetime(2025, 1, 8, 9),
+        event_ends_before=datetime(2025, 1, 8, 10),
+    )
+    forecast_belief_time = pd.Timestamp("2025-01-08T10:00:00")
+    regressor_a, regressor_b = pipeline.past_regressors
+
+    df = pd.DataFrame(
+        [
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": forecast_belief_time - pd.Timedelta(minutes=30),
+                pipeline.target: None,
+                regressor_a: 5.0,
+                regressor_b: None,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": forecast_belief_time - pd.Timedelta(minutes=15),
+                pipeline.target: None,
+                regressor_a: None,
+                regressor_b: 7.0,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": forecast_belief_time,
+                pipeline.target: 1.0,
+                regressor_a: None,
+                regressor_b: None,
+            },
+        ]
+    )
+
+    captured_past_frames = []
+
+    # Capture the covariate frame before missing-value filling converts it
+    # to a Darts TimeSeries. This keeps the test focused on in-memory belief
+    # selection instead of requiring database-backed sensor data.
+    def capture_frame(self, df, sensors, sensor_names, start, end, **kwargs):
+        if sensor_names == self.past_regressors:
+            captured_past_frames.append(df.copy())
+        return df
+
+    monkeypatch.setattr(BasePipeline, "detect_and_fill_missing_values", capture_frame)
+
+    pipeline.split_data_all_beliefs(df)
+
+    assert len(captured_past_frames) == 1, (
+        "Expected one past-covariate frame because this one-step pipeline "
+        "prepares exactly one split."
+    )
+    selected = captured_past_frames[0].set_index("event_start")
+    assert selected.loc[pd.Timestamp("2025-01-08T09:00:00"), regressor_a] == 5.0, (
+        "Expected past regressor A's value to survive even though past "
+        "regressor B is known on a different joined row."
+    )
+    assert selected.loc[pd.Timestamp("2025-01-08T09:00:00"), regressor_b] == 7.0, (
+        "Expected past regressor B's value to survive because selection happens "
+        "independently per regressor."
+    )
+
+
 def test_future_regressor_splits_use_only_beliefs_known_at_forecast_belief_time(
     monkeypatch,
 ):
@@ -771,6 +983,138 @@ def capture_frame(self, df, sensors, sensor_names, start, end, **kwargs):
     assert 77.0 not in set(values_by_event)
 
 
+def test_realized_future_regressors_use_latest_known_per_regressor_per_step(
+    monkeypatch,
+):
+    """Check per-step selection of realized future regressor values.
+
+    Each simulated forecast step should use, per regressor, the latest belief
+    recorded by that step's belief time.
+    """
+    target_sensor = type(
+        "SensorStub",
+        (),
+        {"name": "target", "id": 1, "event_resolution": timedelta(hours=1)},
+    )()
+    future_regressor_a = type(
+        "SensorStub",
+        (),
+        {"name": "weather-a", "id": 2, "event_resolution": timedelta(hours=1)},
+    )()
+    future_regressor_b = type(
+        "SensorStub",
+        (),
+        {"name": "weather-b", "id": 3, "event_resolution": timedelta(hours=1)},
+    )()
+
+    pipeline = BasePipeline(
+        target_sensor=target_sensor,
+        future_regressors=[future_regressor_a, future_regressor_b],
+        past_regressors=[],
+        n_steps_to_predict=2,
+        max_forecast_horizon=1,
+        forecast_frequency=1,
+        event_starts_after=datetime(2025, 1, 8, 9),
+        event_ends_before=datetime(2025, 1, 8, 11),
+        predict_start=datetime(2025, 1, 8, 10),
+        predict_end=datetime(2025, 1, 8, 12),
+    )
+    regressor_a, regressor_b = pipeline.future_regressors
+
+    df = pd.DataFrame(
+        [
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": pd.Timestamp("2025-01-08T09:10:00"),
+                pipeline.target: None,
+                regressor_a: 1.0,
+                regressor_b: None,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": pd.Timestamp("2025-01-08T09:40:00"),
+                pipeline.target: None,
+                regressor_a: None,
+                regressor_b: 20.0,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": pd.Timestamp("2025-01-08T09:50:00"),
+                pipeline.target: None,
+                regressor_a: 2.0,
+                regressor_b: None,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": pd.Timestamp("2025-01-08T10:00:00"),
+                pipeline.target: 1.0,
+                regressor_a: None,
+                regressor_b: None,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": pd.Timestamp("2025-01-08T10:30:00"),
+                pipeline.target: None,
+                regressor_a: 3.0,
+                regressor_b: None,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T09:00:00"),
+                "belief_time": pd.Timestamp("2025-01-08T10:45:00"),
+                pipeline.target: None,
+                regressor_a: None,
+                regressor_b: 30.0,
+            },
+            {
+                "event_start": pd.Timestamp("2025-01-08T10:00:00"),
+                "belief_time": pd.Timestamp("2025-01-08T11:00:00"),
+                pipeline.target: 2.0,
+                regressor_a: None,
+                regressor_b: None,
+            },
+        ]
+    )
+
+    captured_future_frames = []
+
+    # Capture the covariate frame before missing-value filling converts it
+    # to a Darts TimeSeries. This keeps the test focused on in-memory belief
+    # selection instead of requiring database-backed sensor data.
+    def capture_frame(self, df, sensors, sensor_names, start, end, **kwargs):
+        if sensor_names == self.future_regressors:
+            captured_future_frames.append(df.copy())
+        return df
+
+    monkeypatch.setattr(BasePipeline, "detect_and_fill_missing_values", capture_frame)
+
+    pipeline.split_data_all_beliefs(df, is_predict_pipeline=True)
+
+    assert len(captured_future_frames) == 2, (
+        "Expected two future-covariate frames because the predict pipeline "
+        "simulates two forecast belief_time steps."
+    )
+    first_step = captured_future_frames[0].set_index("event_start")
+    second_step = captured_future_frames[1].set_index("event_start")
+    event_start = pd.Timestamp("2025-01-08T09:00:00")
+
+    assert first_step.loc[event_start, regressor_a] == 2.0, (
+        "Expected the first forecast step to use regressor A's latest realized "
+        "belief known by 10:00, not the older 09:10 belief."
+    )
+    assert first_step.loc[event_start, regressor_b] == 20.0, (
+        "Expected the first forecast step to exclude regressor B's 10:45 belief "
+        "because it is not known yet at 10:00."
+    )
+    assert second_step.loc[event_start, regressor_a] == 3.0, (
+        "Expected the second forecast step to use regressor A's 10:30 belief "
+        "because it is known by 11:00."
+    )
+    assert second_step.loc[event_start, regressor_b] == 30.0, (
+        "Expected the second forecast step to use regressor B's 10:45 belief "
+        "because it is known by 11:00."
+    )
+
+
 def test_future_regressor_changes_forecasts_in_forecast_belief_time_window(
     app, fresh_db, tmp_path
 ):