Skip to content

Commit 69fc9b0

Browse files
author
HeerakKashyap
committed
feat: add synthetic data generation module and CLI command (closes #285)
1 parent 24ad29a commit 69fc9b0

File tree

1 file changed

+94
-0
lines changed

1 file changed

+94
-0
lines changed

synthetic_data.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
"""
2+
Synthetic Data Generation Utilities

- Tabular data generation
- Time series data generation
- Categorical data generation
- Data with specific distributions
7+
"""
8+
9+
import numpy as np
10+
import pandas as pd
11+
from datetime import datetime, timedelta
12+
13+
def generate_tabular_data(n_samples=1000, n_features=10, target_column=True,
                          random_state=None):
    """
    Generate synthetic tabular data for classification/regression.

    Features are independent standard-normal draws named ``feature_0`` ..
    ``feature_{n_features - 1}``. The optional ``target`` column is 1 where
    ``feature_0`` is positive and 0 otherwise.

    Args:
        n_samples: Number of samples (rows) to generate
        n_features: Number of feature columns
        target_column: Whether to include a binary target column
        random_state: Optional integer seed for reproducible output. When
            None (default), NumPy's global random state is used, so
            ``np.random.seed()`` keeps controlling the result.

    Returns:
        DataFrame with synthetic data

    Raises:
        ValueError: If n_samples or n_features is negative, or if a target
            column is requested with no features to derive it from
    """
    if n_samples < 0 or n_features < 0:
        raise ValueError("n_samples and n_features must be non-negative")
    if target_column and n_features == 0:
        # The target is computed from feature_0, so at least one feature
        # must exist; fail clearly instead of with a KeyError later on.
        raise ValueError("target_column=True requires n_features >= 1")

    # Fall back to the module-level generator so that callers relying on a
    # global np.random.seed() get exactly the same stream as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Generate features
    features = rng.randn(n_samples, n_features)
    feature_names = [f'feature_{i}' for i in range(n_features)]

    # Create DataFrame
    df = pd.DataFrame(features, columns=feature_names)

    # Add target column if requested
    if target_column:
        # Simple target based on the sign of the first feature
        df['target'] = (df['feature_0'] > 0).astype(int)

    return df
38+
39+
def generate_time_series_data(n_samples=1000, trend=True, seasonality=True,
                              start_date='2020-01-01', seasonal_period=365,
                              random_state=None):
    """
    Generate synthetic time series data.

    The series is Gaussian noise (standard deviation 0.1) on a daily index,
    optionally combined with a linear trend rising from 0 to 2 over the
    whole series and a sine wave of amplitude 0.5.

    Args:
        n_samples: Number of time points
        trend: Whether to add trend
        seasonality: Whether to add seasonality
        start_date: First date of the daily index (anything accepted by
            ``pd.date_range``); defaults to '2020-01-01'
        seasonal_period: Length of one seasonal cycle in samples (days);
            defaults to 365
        random_state: Optional integer seed for reproducible output. When
            None (default), NumPy's global random state is used, so
            ``np.random.seed()`` keeps controlling the result.

    Returns:
        DataFrame with time series data in columns 'date' and 'value'

    Raises:
        ValueError: If n_samples is negative, or if seasonality is requested
            with a non-positive seasonal_period
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative")
    if seasonality and seasonal_period <= 0:
        # A zero period would divide by zero and fill the series with
        # inf/NaN instead of failing.
        raise ValueError("seasonal_period must be positive")

    # Fall back to the module-level generator so that callers relying on a
    # global np.random.seed() get exactly the same stream as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Base time axis
    time_index = pd.date_range(start=start_date, periods=n_samples, freq='D')

    # Generate base signal (low-amplitude noise)
    signal = rng.randn(n_samples) * 0.1

    # Add trend
    if trend:
        signal += np.linspace(0, 2, n_samples)

    # Add seasonality
    if seasonality:
        phase = 2 * np.pi * np.arange(n_samples) / seasonal_period
        signal += 0.5 * np.sin(phase)

    # Create DataFrame
    return pd.DataFrame({
        'date': time_index,
        'value': signal,
    })
74+
75+
def generate_categorical_data(n_samples=1000, n_categories=5, random_state=None):
    """
    Generate synthetic categorical data.

    Each row gets a label drawn uniformly from ``category_0`` ..
    ``category_{n_categories - 1}`` and an independent standard-normal value.

    Args:
        n_samples: Number of samples
        n_categories: Number of categories
        random_state: Optional integer seed for reproducible output. When
            None (default), NumPy's global random state is used, so
            ``np.random.seed()`` keeps controlling the result.

    Returns:
        DataFrame with categorical data in columns 'category' and 'value'

    Raises:
        ValueError: If n_samples is negative, or if samples are requested
            with fewer than one category
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative")
    if n_samples > 0 and n_categories < 1:
        # Sampling from an empty category list cannot succeed; say so
        # explicitly rather than surfacing NumPy's generic error.
        raise ValueError("n_categories must be at least 1 when n_samples > 0")

    # Fall back to the module-level generator so that callers relying on a
    # global np.random.seed() get exactly the same stream as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    categories = [f'category_{i}' for i in range(n_categories)]

    # Draw labels before values: this order fixes the random stream layout.
    labels = rng.choice(categories, size=n_samples)
    values = rng.randn(n_samples)

    return pd.DataFrame({
        'category': labels,
        'value': values,
    })

0 commit comments

Comments
 (0)