-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpca.py
More file actions
128 lines (98 loc) · 3.61 KB
/
pca.py
File metadata and controls
128 lines (98 loc) · 3.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import numpy as np
class PCA:
    """
    Principal Component Analysis (PCA) via eigendecomposition of the
    sample covariance matrix.

    Parameters:
    - n_components (int): Number of principal components to retain.

    Attributes:
    - n_components (int): Number of principal components to retain.
    - components (ndarray or None): Array of shape (n_features, n_components)
      whose columns are the retained principal axes; None until `fit` is called.
    - mean (ndarray or None): Per-feature mean of the data passed to `fit`,
      used to centre inputs; None until `fit` is called.
    - explained_variance (float): Fraction of the total variance captured by
      the retained components (a ratio, not an absolute variance); 0 until
      `fit` is called.
    """

    def __init__(self, n_components: int):
        """
        Initialize PCA with the specified number of components.

        Parameters:
        - n_components (int): Number of principal components to retain.
        """
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.explained_variance = 0

    def _standardize(self, X: np.ndarray) -> np.ndarray:
        """
        Centre the input data by subtracting the fitted per-feature mean.

        Note that no scaling to unit variance is performed.

        Parameters:
        - X (ndarray): Input data.

        Returns:
        - ndarray: Centred data.
        """
        return X - self.mean

    def _swap_signs(self, M: np.ndarray) -> np.ndarray:
        """
        Flip the sign of each column so that its entry with the largest
        absolute value is positive.

        Eigenvectors are only defined up to sign; this makes the result
        deterministic. The input matrix is not modified.

        Parameters:
        - M (ndarray): Input matrix.

        Returns:
        - ndarray: New matrix with the column signs normalized.
        """
        max_abs_rows = np.argmax(np.abs(M), axis=0)
        signs = np.sign(M[max_abs_rows, np.arange(M.shape[1])])
        # An all-zero column has sign 0; leave it untouched rather than
        # multiplying it by zero.
        signs = np.where(signs == 0, 1, signs)
        return M * signs

    def fit(self, X: np.ndarray) -> None:
        """
        Fit the PCA model to the input data.

        Parameters:
        - X (ndarray): Input data of shape (n_samples, n_features).

        Raises:
        - ValueError: If X is not two-dimensional or has fewer than two
          samples (the sample covariance is undefined in that case).
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] < 2:
            raise ValueError("At least two samples are required to fit PCA")

        self.mean = np.mean(X, axis=0)
        X_centered = self._standardize(X)
        S = X_centered.T @ X_centered / (X.shape[0] - 1)

        # The covariance matrix is symmetric, so use eigh: unlike eig it
        # guarantees real eigenvalues and orthonormal real eigenvectors.
        eigenvalues, eigenvectors = np.linalg.eigh(S)

        # Sort eigenvectors by eigenvalues in descending order
        sorted_idxs = np.argsort(eigenvalues)[::-1]
        kept = sorted_idxs[:self.n_components]
        self.components = self._swap_signs(eigenvectors[:, kept])

        # Round-off can yield tiny negative eigenvalues for a PSD matrix;
        # clip them so the variance ratio stays within [0, 1].
        eigenvalues = np.clip(eigenvalues, 0.0, None)
        total_variance = np.sum(eigenvalues)
        if total_variance > 0:
            self.explained_variance = np.sum(eigenvalues[kept]) / total_variance
        else:
            # Constant data: there is no variance to explain.
            self.explained_variance = 0.0

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the input data using the fitted PCA model.

        Parameters:
        - X (ndarray): Input data.

        Returns:
        - ndarray: Centred data projected onto the retained components.

        Raises:
        - RuntimeError: If the model has not been fitted yet.
        """
        if self.components is None:
            raise RuntimeError("Call fit method first!")
        X_centered = self._standardize(np.asarray(X, dtype=float))
        return np.dot(X_centered, self.components)

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Fit the PCA model to the input data and transform it.

        Parameters:
        - X (ndarray): Input data.

        Returns:
        - ndarray: Transformed data.
        """
        self.fit(X)
        return self.transform(X)
if __name__ == '__main__':
    # Smoke test: project a small 10x5 dataset onto two principal components,
    # map the projection back to feature space, and report how closely the
    # original values are recovered.
    samples = [
        [1.5, 2.0, 3.5, 4.2, 5.8],
        [2.0, 3.2, 4.5, 5.7, 6.2],
        [3.2, 4.5, 5.1, 6.0, 7.5],
        [4.1, 5.6, 6.9, 7.2, 8.1],
        [5.3, 6.4, 7.7, 8.4, 9.2],
        [6.2, 7.1, 8.4, 9.3, 10.0],
        [7.5, 8.3, 9.6, 10.2, 11.0],
        [8.8, 9.6, 10.9, 11.5, 12.2],
        [9.9, 10.8, 11.7, 12.3, 13.0],
        [10.5, 11.7, 12.9, 13.6, 14.5],
    ]
    X = np.array(samples, dtype=float)

    pca = PCA(n_components=2)
    scores = pca.fit_transform(X)

    # Undo the projection: back to feature space, then re-add the mean.
    reconstructed = scores @ pca.components.T + pca.mean
    mae = np.abs(X - reconstructed).mean()
    print(f'Mean absolute error on reconstruction: {mae:3.2f}')