"""
Synthetic Data Generation Utilities

- Tabular data generation
- Time series data generation
- Data with specific distributions
"""
8+
9+ import numpy as np
10+ import pandas as pd
11+ from datetime import datetime , timedelta
12+
def generate_tabular_data(n_samples=1000, n_features=10, target_column=True):
    """
    Generate synthetic tabular data for classification/regression.

    Feature values are drawn with ``np.random.randn`` (global NumPy random
    state) and stored in columns named ``feature_0`` through
    ``feature_{n_features - 1}``.

    Args:
        n_samples: Number of samples (rows) to generate
        n_features: Number of feature columns
        target_column: Whether to append a binary ``target`` column that is
            1 where ``feature_0`` is positive and 0 otherwise

    Returns:
        DataFrame with synthetic data

    Raises:
        ValueError: If ``target_column`` is true but ``n_features`` is less
            than 1, since the target is derived from ``feature_0``
    """
    # The target depends on feature_0, so fail early with a clear message
    # instead of an opaque KeyError on the column lookup below.
    if target_column and n_features < 1:
        raise ValueError(
            "n_features must be at least 1 when target_column is True"
        )

    # Generate features
    features = np.random.randn(n_samples, n_features)
    # No padding inside the braces or after them: the column name must be
    # exactly 'feature_<i>' so the 'feature_0' lookup below succeeds.
    feature_names = [f'feature_{i}' for i in range(n_features)]

    # Create DataFrame
    df = pd.DataFrame(features, columns=feature_names)

    # Add target column if requested
    if target_column:
        # Simple target based on the sign of the first feature
        df['target'] = (df['feature_0'] > 0).astype(int)

    return df
38+
def generate_time_series_data(n_samples=1000, trend=True, seasonality=True,
                              seasonal_period=365, start_date='2020-01-01'):
    """
    Generate synthetic time series data.

    The series is Gaussian noise (``np.random.randn`` scaled by 0.1, global
    NumPy random state), optionally combined with a linear trend rising
    from 0 to 2 over the series and a sine wave of amplitude 0.5.

    Args:
        n_samples: Number of time points
        trend: Whether to add trend
        seasonality: Whether to add seasonality
        seasonal_period: Length of one seasonal cycle, in time points
            (defaults to 365)
        start_date: Date of the first time point; subsequent points follow
            at daily frequency (defaults to '2020-01-01')

    Returns:
        DataFrame with a ``date`` column and a ``value`` column

    Raises:
        ValueError: If ``seasonality`` is true and ``seasonal_period`` is
            not positive
    """
    # A zero period would divide by zero and fill the series with NaN/inf.
    if seasonality and seasonal_period <= 0:
        raise ValueError("seasonal_period must be positive")

    # Daily time index
    time_index = pd.date_range(start=start_date, periods=n_samples, freq='D')

    # Generate base signal (low-amplitude noise)
    signal = np.random.randn(n_samples) * 0.1

    # Add trend
    if trend:
        signal += np.linspace(0, 2, n_samples)

    # Add seasonality
    if seasonality:
        phase = 2 * np.pi * np.arange(n_samples) / seasonal_period
        signal += 0.5 * np.sin(phase)

    # Create DataFrame
    df = pd.DataFrame({
        'date': time_index,
        'value': signal,
    })

    return df
74+
def generate_categorical_data(n_samples=1000, n_categories=5):
    """
    Generate synthetic categorical data.

    Each row gets a label drawn with ``np.random.choice`` from
    ``category_0`` ... ``category_{n_categories - 1}`` and a value drawn
    with ``np.random.randn`` (both use the global NumPy random state).

    Args:
        n_samples: Number of samples
        n_categories: Number of categories

    Returns:
        DataFrame with a ``category`` column and a ``value`` column

    Raises:
        ValueError: If samples are requested (``n_samples > 0``) but
            ``n_categories`` is less than 1
    """
    # Sampling from an empty label set cannot succeed; report it explicitly.
    if n_samples > 0 and n_categories < 1:
        raise ValueError(
            "n_categories must be at least 1 when n_samples is positive"
        )

    # Labels must be exactly 'category_<i>' with no stray whitespace.
    categories = [f'category_{i}' for i in range(n_categories)]
    data = np.random.choice(categories, size=n_samples)

    df = pd.DataFrame({
        'category': data,
        'value': np.random.randn(n_samples),
    })

    return df
0 commit comments