-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_preprocessing.py
More file actions
41 lines (31 loc) · 1.53 KB
/
data_preprocessing.py
File metadata and controls
41 lines (31 loc) · 1.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import joblib
# Preprocess the housing dataset: split into train/test sets, impute missing
# values, one-hot encode the categorical columns, standardize the features,
# and persist both the prepared data and the fitted scaler.

# Load the dataset
df = pd.read_csv("housing.csv")

# A row without a target cannot be used for supervised training or evaluation,
# so drop it rather than inventing a target value through mean imputation.
df = df.dropna(subset=['median_house_value'])

# Separate features (X) and target (y) up front so the target is never
# imputed, encoded, or scaled along with the features.
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

# Split BEFORE fitting any imputer: statistics learned from the test rows
# must not influence the values filled into the training rows (data leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below modify independent
# frames instead of slices derived from X.
X_train = X_train.copy()
X_test = X_test.copy()

# Separate numerical and categorical feature columns
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Impute numerical features with the training-set mean.
# The guard avoids fitting an imputer on a zero-column selection.
if len(numerical_cols) > 0:
    numerical_imputer = SimpleImputer(strategy='mean')
    X_train[numerical_cols] = numerical_imputer.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = numerical_imputer.transform(X_test[numerical_cols])

# Impute categorical features with the training-set mode.
if len(categorical_cols) > 0:
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    X_train[categorical_cols] = categorical_imputer.fit_transform(X_train[categorical_cols])
    X_test[categorical_cols] = categorical_imputer.transform(X_test[categorical_cols])

# One-hot encode the categorical columns. Encoding the recombined frame in a
# single call guarantees that train and test end up with the same dummy
# columns in the same order, and that drop_first removes the same category
# from both. Rows are then separated again by position.
n_train = len(X_train)
encoded = pd.get_dummies(
    pd.concat([X_train, X_test]),
    columns=categorical_cols,
    drop_first=True,
)
X_train = encoded.iloc[:n_train]
X_test = encoded.iloc[n_train:]

# Scale features: fit on the training set only, then apply to the test set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the preprocessed data and the scaler
joblib.dump((X_train_scaled, X_test_scaled, y_train, y_test), 'prepared_data.pkl')
joblib.dump(scaler, 'scaler.pkl')  # this line was added
print("Data preprocessing completed and saved.")