@@ -294,3 +294,210 @@ def get_target_category_columns_map(self):
294
294
def _fill_na (self , df : pd .DataFrame , na_value = 0 ) -> pd .DataFrame :
295
295
"""Fill nans in dataframe"""
296
296
return df .fillna (value = na_value )
297
+
298
def build_fforms_meta_features(self, data, target_col=None, group_cols=None):
    """
    Build meta-features for time series based on the FFORMS paper.

    Parameters
    ----------
    data : pandas.DataFrame
        Input DataFrame containing time series data.
    target_col : str, optional
        Name of the target column to calculate meta-features for.
        If None, uses the target column specified in dataset_info.
    group_cols : list of str, optional
        List of columns to group by before calculating meta-features.
        If None, falls back to ``self.target_category_columns``; when no
        grouping applies, the whole frame is treated as a single series.

    Returns
    -------
    pandas.DataFrame
        One row of ``ts_*`` meta-features per group (with the group key
        columns included), or a single-row frame when there is no grouping.

    Raises
    ------
    ValueError
        If *data* is not a DataFrame, or a named column is missing.

    References
    ----------
    Talagala, T. S., Hyndman, R. J., & Athanasopoulos, G. (2023).
    Meta-learning how to forecast time series. Journal of Forecasting, 42(6), 1476-1501.
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame")

    # Use target column from dataset_info if not specified
    if target_col is None:
        target_col = self.target_column_name
    if target_col not in data.columns:
        raise ValueError(f"Target column '{target_col}' not found in DataFrame")

    # Check if group_cols are provided and valid
    if group_cols is not None:
        if not isinstance(group_cols, list):
            raise ValueError("group_cols must be a list of column names")
        for col in group_cols:
            if col not in data.columns:
                raise ValueError(f"Group column '{col}' not found in DataFrame")

    # If no group_cols, fall back to the configured category columns;
    # otherwise treat the entire DataFrame as a single series.
    if not group_cols:
        group_cols = self.target_category_columns if self.target_category_columns else []

    def calculate_series_features(series):
        """Compute the FFORMS meta-feature vector for one series as a pd.Series."""
        n = len(series)
        values = series.values

        # Basic statistics
        mean = series.mean()
        std = series.std()
        variance = series.var()
        skewness = series.skew()
        kurtosis = series.kurtosis()
        cv = std / mean if mean != 0 else np.inf

        # Trend features: fit y ~ a*t + b once and reuse the coefficients
        # (the original code ran the identical lstsq twice).
        X = np.vstack([np.arange(n), np.ones(n)]).T
        coeffs = np.linalg.lstsq(X, values, rcond=None)[0]
        trend_coef = coeffs[0]
        residuals = values - X.dot(coeffs)
        std_residuals = np.std(residuals)

        # Turning points: local maxima/minima rate
        turning_points = 0
        for i in range(1, n - 1):
            if (values[i - 1] < values[i] and values[i] > values[i + 1]) or \
               (values[i - 1] > values[i] and values[i] < values[i + 1]):
                turning_points += 1
        turning_points_rate = turning_points / (n - 2) if n > 2 else 0

        # Serial correlation
        acf1 = series.autocorr(lag=1) if n > 1 else 0
        acf2 = series.autocorr(lag=2) if n > 2 else 0
        acf10 = series.autocorr(lag=10) if n > 10 else 0

        # Seasonality features (lags assume a monthly-style period of 12 —
        # NOTE(review): confirm this matches the dataset frequency)
        seasonal_strength = 0
        seasonal_peak_strength = 0
        if n >= 12:
            seasonal_acfs = [
                abs(series.autocorr(lag=lag)) for lag in (12, 24, 36) if n > lag
            ]
            seasonal_peak_strength = max(seasonal_acfs) if seasonal_acfs else 0

            ma = series.rolling(window=12, center=True).mean()
            seasonal_comp = series - ma
            series_var = np.var(series)
            # Guard: a constant series has zero variance (division by zero before).
            if series_var > 0:
                seasonal_strength = 1 - np.var(seasonal_comp.dropna()) / series_var

        # Stability / volatility: rate of crossings of the mean level
        values_above_mean = values >= mean
        crossing_points = np.sum(values_above_mean[1:] != values_above_mean[:-1])
        crossing_rate = crossing_points / (n - 1) if n > 1 else 0

        # First and second differences
        diff1 = np.diff(values)
        diff2 = np.diff(diff1) if len(diff1) > 1 else np.array([])
        diff1_mean = np.mean(np.abs(diff1)) if len(diff1) > 0 else 0
        diff1_var = np.var(diff1) if len(diff1) > 0 else 0
        diff2_mean = np.mean(np.abs(diff2)) if len(diff2) > 0 else 0
        diff2_var = np.var(diff2) if len(diff2) > 0 else 0

        # Nonlinearity: SSR of a cubic autoregression.
        # BUG FIX: np.linalg.lstsq returns an EMPTY residual array when the
        # system is square or rank-deficient (e.g. n == 4 gives a 3x3 system;
        # a constant series gives rank 1), so the old `[1][0]` indexing raised
        # IndexError. Guard on the residual array size, not on len(y).
        nonlinearity = 0
        if n > 3:
            X_lag = values[:-1].reshape(-1, 1)
            y_next = values[1:]
            X_aug = np.hstack([X_lag, X_lag * X_lag, X_lag * X_lag * X_lag])
            ssr = np.linalg.lstsq(X_aug, y_next, rcond=None)[1]
            nonlinearity = ssr[0] if ssr.size > 0 else 0

        # Long-term trend change: second-half mean minus first-half mean
        if n >= 10:
            mid = n // 2
            trend_change = np.mean(values[mid:]) - np.mean(values[:mid])
        else:
            trend_change = 0

        # Step changes and spikes
        step_changes = np.abs(diff1).max() if len(diff1) > 0 else 0
        spikes = np.sum(np.abs(values - mean) > 2 * std) / n if std != 0 else 0

        # Hurst exponent via a variance-ratio estimate.
        # Guard the zero-variance denominator (constant series) that previously
        # produced a divide-by-zero warning and NaN/inf.
        lag = min(10, n // 2)
        hurst = 0
        if n > lag and lag > 1:
            denom = lag * np.var(series.diff())
            if denom > 0:
                variance_ratio = np.var(series.diff(lag)) / denom
                if variance_ratio > 0:
                    hurst = np.log(variance_ratio) / (2 * np.log(lag))

        # Histogram-based entropy of the value distribution
        hist, _ = np.histogram(series, bins='auto', density=True)
        entropy = -np.sum(hist[hist > 0] * np.log(hist[hist > 0]))

        return pd.Series({
            'ts_n_obs': n,
            'ts_mean': mean,
            'ts_std': std,
            'ts_variance': variance,
            'ts_cv': cv,
            'ts_skewness': skewness,
            'ts_kurtosis': kurtosis,
            'ts_trend': trend_coef,
            'ts_trend_change': trend_change,
            'ts_std_residuals': std_residuals,
            'ts_turning_points_rate': turning_points_rate,
            'ts_seasonal_strength': seasonal_strength,
            'ts_seasonal_peak_strength': seasonal_peak_strength,
            'ts_acf1': acf1,
            'ts_acf2': acf2,
            'ts_acf10': acf10,
            'ts_crossing_rate': crossing_rate,
            'ts_diff1_mean': diff1_mean,
            'ts_diff1_variance': diff1_var,
            'ts_diff2_mean': diff2_mean,
            'ts_diff2_variance': diff2_var,
            'ts_nonlinearity': nonlinearity,
            'ts_step_max': step_changes,
            'ts_spikes_rate': spikes,
            'ts_hurst': hurst,
            'ts_entropy': entropy,
        })

    date_col = self.dt_column_name if self.dt_column_name else 'Date'

    if group_cols:
        # One feature row per group. Sorting happens per group below; the
        # old redundant whole-frame pre-sort (which ordered by date BEFORE
        # group, then re-sorted each group anyway) is removed.
        features = []
        for name, group in data.groupby(group_cols):
            if date_col in group.columns:
                group = group.sort_values(date_col)
            group_features = calculate_series_features(group[target_col])
            if isinstance(name, tuple):
                feature_row = dict(zip(group_cols, name))
            else:
                feature_row = {group_cols[0]: name}
            feature_row.update(group_features)
            features.append(feature_row)
        # Return only the meta-features DataFrame with group key columns.
        return pd.DataFrame(features)

    # No grouping: sort by date if present and score the entire series.
    if date_col in data.columns:
        data = data.sort_values(date_col)
    features = calculate_series_features(data[target_col])
    # NOTE: the old trailing `return result_df` was unreachable dead code
    # (both branches return above) and has been removed with its unused copy.
    return pd.DataFrame([features])
0 commit comments