# ML-Algorithms-From-Scratch/mllib/pca.py (main branch, insdout/ML-Algorithms-From-Scratch on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import numpy as np


class PCA:
    """
    Principal Component Analysis (PCA) via eigendecomposition of the covariance matrix.

    Parameters:
    - n_components (int): Number of principal components to retain.

    Attributes:
    - n_components (int): Number of principal components to retain.
    - components (ndarray or None): Matrix with one column per retained principal
      direction (rows index the input features); None until fit is called.
    - mean (ndarray or None): Per-feature mean of the data passed to fit, used for centering.
    - explained_variance (float): Fraction of the total variance captured by the
      retained components (0 before fitting).
    """

    def __init__(self, n_components: int):
        """
        Initialize PCA with the specified number of components.

        Parameters:
        - n_components (int): Number of principal components to retain.
        """
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.explained_variance = 0.0

    def _standardize(self, X: np.ndarray) -> np.ndarray:
        """
        Center the input data by subtracting the mean learned in fit.

        Only the mean is removed; the features are not rescaled.

        Parameters:
        - X (ndarray): Input data.

        Returns:
        - ndarray: Mean-centered data.
        """
        return np.asarray(X, dtype=float) - self.mean

    def _swap_signs(self, M: np.ndarray) -> np.ndarray:
        """
        Fix the sign of every column so that its largest-magnitude entry is positive.

        Eigenvectors are only defined up to sign; this makes the result deterministic.
        The input matrix is left untouched.

        Parameters:
        - M (ndarray): Input matrix whose columns are to be sign-normalized.

        Returns:
        - ndarray: New matrix with column signs normalized.
        """
        pivot_rows = np.argmax(np.abs(M), axis=0)
        signs = np.sign(M[pivot_rows, np.arange(M.shape[1])])
        # An all-zero column has sign 0; keep it as is instead of multiplying by 0.
        signs = np.where(signs == 0, 1.0, signs)
        return M * signs

    def fit(self, X: np.ndarray) -> None:
        """
        Fit the PCA model to the input data.

        Parameters:
        - X (ndarray): Input data, one sample per row and one feature per column.

        Raises:
        - ValueError: If X is not 2-dimensional or contains fewer than two samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-dimensional array (samples x features).")
        if X.shape[0] < 2:
            # The unbiased covariance estimate divides by (n_samples - 1).
            raise ValueError("At least two samples are required to fit PCA.")

        self.mean = np.mean(X, axis=0)
        X_centered = self._standardize(X)

        S = X_centered.T @ X_centered / (X.shape[0] - 1)

        # The covariance matrix is symmetric, so use eigh: it guarantees real
        # eigenvalues/eigenvectors, whereas eig may return complex values on
        # rank-deficient input because of round-off.
        eigenvalues, eigenvectors = np.linalg.eigh(S)

        # Sort eigenvectors by eigenvalues in descending order
        sorted_idxs = np.argsort(eigenvalues)[::-1]
        kept = sorted_idxs[:self.n_components]
        self.components = self._swap_signs(eigenvectors[:, kept])

        total_variance = np.sum(eigenvalues)
        if total_variance > 0:
            self.explained_variance = np.sum(eigenvalues[kept]) / total_variance
        else:
            # Constant data has no variance to explain; avoid a 0/0 NaN.
            self.explained_variance = 0.0

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the input data using the fitted PCA model.

        Parameters:
        - X (ndarray): Input data.

        Returns:
        - ndarray: Centered data projected onto the retained components.

        Raises:
        - RuntimeError: If the model has not been fitted yet.
        """
        if self.components is None:
            raise RuntimeError("Call fit method first!")

        X_centered = self._standardize(X)
        return np.dot(X_centered, self.components)

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Fit the PCA model to the input data and transform it.

        Parameters:
        - X (ndarray): Input data.

        Returns:
        - ndarray: Transformed data.
        """
        self.fit(X)
        return self.transform(X)


if __name__ == '__main__':

    # Smoke test: project a small 10x5 dataset onto two principal components
    # and report how closely the data can be rebuilt from that projection.
    X = np.array(
        [
            [1.5, 2.0, 3.5, 4.2, 5.8],
            [2.0, 3.2, 4.5, 5.7, 6.2],
            [3.2, 4.5, 5.1, 6.0, 7.5],
            [4.1, 5.6, 6.9, 7.2, 8.1],
            [5.3, 6.4, 7.7, 8.4, 9.2],
            [6.2, 7.1, 8.4, 9.3, 10.0],
            [7.5, 8.3, 9.6, 10.2, 11.0],
            [8.8, 9.6, 10.9, 11.5, 12.2],
            [9.9, 10.8, 11.7, 12.3, 13.0],
            [10.5, 11.7, 12.9, 13.6, 14.5],
        ],
        dtype=float,
    )

    model = PCA(n_components=2)
    projected = model.fit_transform(X)

    # Map the low-dimensional scores back to feature space and undo the centering.
    reconstructed = np.dot(projected, model.components.T) + model.mean
    print(f'Mean absolute error on reconstruction: {np.mean(np.abs(X - reconstructed)):3.2f}')