From bd700607819cfd1222d42282b50a2235c50e545a Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Sun, 16 Oct 2022 23:52:54 -0700
Subject: [PATCH 01/31] Initial Implementation

---
 tests/unit/svm/__init__.py        |  0
 tests/unit/svm/linear_svc_test.py | 34 +++++++++++++
 torchml/svm/__init__.py           |  1 +
 torchml/svm/linear_svc.py         | 84 +++++++++++++++++++++++++++++++
 4 files changed, 119 insertions(+)
 create mode 100644 tests/unit/svm/__init__.py
 create mode 100644 tests/unit/svm/linear_svc_test.py
 create mode 100644 torchml/svm/__init__.py
 create mode 100644 torchml/svm/linear_svc.py

diff --git a/tests/unit/svm/__init__.py b/tests/unit/svm/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
new file mode 100644
index 0000000..c1caf02
--- /dev/null
+++ b/tests/unit/svm/linear_svc_test.py
@@ -0,0 +1,34 @@
+import unittest
+import numpy as np
+import torch
+from sklearn.datasets import make_classification
+import sklearn.svm as svm
+import time
+
+import torchml as ml
+from torchml.svm import LinearSVC
+
+BSZ = 128
+DIM = 5
+
+
+class TestLinearSVC(unittest.TestCase):
+    def test_coef(self):
+        x, y = make_classification(n_samples=500, n_features=10,
+                                   n_classes=2)
+        lsvc = LinearSVC(verbose=0)
+        start = time.time()
+        lsvc.fit(torch.from_numpy(x), torch.from_numpy(y))
+        end = time.time()
+        print(end - start)
+        print("Here")
+        start = time.time()
+        reflsvc = svm.LinearSVC()
+        reflsvc.fit(x, y)
+        end = time.time()
+        print(end - start)
+        print("Here")
+        self.assertTrue(np.allclose(lsvc.coef_.numpy(), reflsvc.coef_, atol=0.03))
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/torchml/svm/__init__.py b/torchml/svm/__init__.py
new file mode 100644
index 0000000..c799ee5
--- /dev/null
+++ b/torchml/svm/__init__.py
@@ -0,0 +1 @@
+from .linear_svc import LinearSVC
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
new file mode 100644
index 0000000..e4f06f9
--- /dev/null
+++ b/torchml/svm/linear_svc.py
@@ -0,0 +1,84 @@
+import torch
+from sklearn.datasets import make_classification
+
+import torchml as ml
+import cvxpy as cp
+from cvxpylayers.torch import CvxpyLayer
+
+from sklearn import svm
+
+
+class LinearSVC(ml.Model):
+
+    def __init__(
+            self,
+            penalty="l2",
+            loss="squared_hinge",
+            *,
+            dual=True,
+            tol=1e-4,
+            C=1.0,
+            multi_class="ovr",
+            fit_intercept=True,
+            intercept_scaling=1,
+            class_weight=None,
+            verbose=0,
+            random_state=None,
+            max_iter=1000,
+    ):
+        super(LinearSVC, self).__init__()
+        self.coef_ = None
+        self.intercept_ = None
+        self.classes_ = None
+        self.y_ = None
+        self.dual = dual
+        self.tol = tol
+        self.C = C
+        self.multi_class = multi_class
+        self.fit_intercept = fit_intercept
+        self.intercept_scaling = intercept_scaling
+        self.class_weight = class_weight
+        self.verbose = verbose
+        self.random_state = random_state
+        self.max_iter = max_iter
+        self.penalty = penalty
+        self.loss = loss
+
+    def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
+        if self.C < 0:
+            raise ValueError(
+                "Penalty term must be positive; got (C=%r)" % self.C)
+        self.classes_ = torch.unique(y)
+        self.y_ = y
+        assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
+
+        m, n = X.shape
+
+        w = cp.Variable(n)
+        if self.fit_intercept:
+            b = cp.Variable()
+
+        obj = 0
+        for i in range(m):
+            if y[i] == self.classes_[1]:
+                yi = 1
+            else:
+                yi = -1
+            if self.fit_intercept:
+                obj += cp.square(cp.pos(1 - yi * (w.T @ X[i] + b)))
+            else:
+                obj += cp.sqaure(cp.pos(1 - yi * (w.T @ X[i])))
+
+        obj *= self.C
+        obj += cp.multiply((1 / 2.0), cp.norm(w, 2))
+
+        prob = cp.Problem(cp.Minimize(obj), [])
+        prob.solve()
+        self.coef_, self.intercept_ = torch.from_numpy(w.value), torch.from_numpy(b.value)
+        # if self.fit_intercept:
+        #     fit_lr = CvxpyLayer(prob, [], [w, b])
+        # else:
+        #     fit_lr = CvxpyLayer(prob, [], [w])
+        #
+        # self.weight, self.intercept = fit_lr()
+        return self

From 252d64148a6acd6b6bf1d0e732c684efb6a76c79 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Tue, 18 Oct 2022 03:04:22 -0700
Subject: [PATCH 02/31] add tutorials for KNN

---
 docs/tutorials/neighbors.md | 79 +++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 docs/tutorials/neighbors.md

diff --git a/docs/tutorials/neighbors.md b/docs/tutorials/neighbors.md
new file mode 100644
index 0000000..0535bc4
--- /dev/null
+++ b/docs/tutorials/neighbors.md
@@ -0,0 +1,79 @@
+# Nearest Neighbors
+`torchml.neighbors` currently supports Unsupervised learnings on classification problem. It currently supports K Nearest Neighbors classification with `torchml.neighbors.NearestNeighbors` that implement `sklearn.neighbors.NearestNeighbors`'s brute force solution with TorchML.
+
+## K Nearest Neighbors classification
+The principle behind Nearest Neighbors algorithms is, given a distance function and a new test point $x$, the algorithm find k closest samples in the known sample set, and use them to estimate the $x$. The number $k$ can be user-defined and tuned according to the particular problem. The distance function can be any arbitrary metric function, and standard Euclidean distance is the most common choice.
+
+One important thing about this algorithm is that its not based on any probabilistic framework, but the algorithm is able to estimate probability for each class given a test point $x$ and its k neighbors.
+
+Given a dataset with $n$ samples and $b$ distinct classes, and a new point $x$ we wish to classify: 
+$\{x_i, y_i\}, i=1,2....n, y_i \in \{c_1, c_2, c_3... c_b\}$
+
+We calculate the number of samples that fall into a class for all classes:
+$\{n_a, a=1,2,3...b\}, \Sigma_{a=1}^{b}n_a = n$
+
+We first find the $k$ nearest neighbors of $x$:
+$\{x_j, y_j\}, i=1,2....k, y_j \in \{c_1, c_2, c_3... c_c\}$
+
+We then count the number of points in the $k$ neighbors that are in the class $c$:
+$\{nk_a, a=1,2,3...b\}, \Sigma_{a=1}^{b}nk_a = k$
+
+The probability that $x$ is of class $c_c$ is simply:
+$P(c_c | x)= {nk_c\over k}$
+
+This estimation is often accurate in practice, even though the algorithm is not built with probability in mind. 
+
+## KNN from a bayesian stand point
+Even though the KNN algorithm is not built on top of probabilistic framework, we can gain intuition behind its shockingly good estimation by framing it in the bayesian framework.
+
+What we want is:
+$P(c_c | x), c=1,2,3...b$
+and in bayesian terms, what we need is:
+$P(c_c | x) = {{P(x | c_c)*P(c_c)} \over {P(x)}}$
+Given nothing but our samples, $P(c_c)$, or the prior, is simply $n_c \over n$
+
+$P(x)$ is the probabilistic density of random variable $x$, and we need to borrow some knowledge from density estimation for this analysis:
+
+Since we don't know $P(x)$, we need to conduct discrete trials on $P(x)$. Suppose that the density $P(x)$ lies in a D-Dimensional space, and we assume it to be Euclidean. We conduct trials in this space by drawing $n$ points on it according to $P(x)$ (these $n$ points are our samples). By principle of locality, for a given point $x_t$ we've drawn on the space, we can assume that the density have some correlations with points in the small space surrounding it. Let's draw a small sphere around the point, and name the space in the sphere $R$.
+
+The total probability that a test point can end up inside $R$ is the sum of probability that a point can be in a point in $R$ over all the small points in $R$, or the probability mass of $P(x)$ in $R$:
+$P_{in R} = {\int_{R} P(x)dx}$
+
+For the $n$ samples we gathered, each sample has a probability $P_{in R}$ of being inside $R$, then the total number of $k$ points that successfully end up in $R$ can be modeled using binomial distribution:
+$Bin(k|n,P_{in R}) = {n! \over {k!(n-k)!}}{P_{in R}^k}{(1-P_{in R})}^{n-k}$
+
+We also have:
+$E(k) = n*P_{in R}$
+$P_{in R} = {{E(k)} \over n}$
+
+For our algorithm we supply the parameter $k$, so we can just sub in our well-chosen $k$ instead of the expectation, which gives us:
+$k \approx n*P_{in R}$
+$P_{in R} \approx {k \over n}$
+
+We further assume that $R$ is quite small, thus $P(x)$ changes very little inside $R$, and we assume $P(x)$ to follow a uniform distribution, then we can derive that:
+$P_{in R} \approx P(x)V$ Where $V$ is the volume of $R$.
+
+Then our final estimation of $P(x)$ will be:
+$P(x) = {{k}\over{nV}}$
+
+We repeat the process for a specific class $c_c$, and we will get:
+$P(x|c_c) = {{nk_c}\over{n_c V}}$
+
+substitute both $P(x|c_c) = {{nk_c}\over{n_c V}}$ and $P(x) = {{k}\over{nV}}$ into our bayesian, we will get:
+$P(c_c | x)= {nk_c\over k}$
+
+## Sources
+* Christopher M. Bishop. 2006. Pattern Recognition and Machine Learning (Information Science and Statistics). Springer-Verlag, Berlin, Heidelberg.
+* [MIT Lecture on KNN](https://youtu.be/09mb78oiPkA)
+
+ 
+
+
+
+
+
+
+
+
+
+

From fd1304776ead9e9845edd2aae02bf24f567f39f3 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Tue, 18 Oct 2022 03:08:28 -0700
Subject: [PATCH 03/31] Add link for sources

---
 docs/tutorials/neighbors.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/tutorials/neighbors.md b/docs/tutorials/neighbors.md
index 0535bc4..c4e3e88 100644
--- a/docs/tutorials/neighbors.md
+++ b/docs/tutorials/neighbors.md
@@ -63,7 +63,7 @@ substitute both $P(x|c_c) = {{nk_c}\over{n_c V}}$ and $P(x) = {{k}\over{nV}}$ in
 $P(c_c | x)= {nk_c\over k}$
 
 ## Sources
-* Christopher M. Bishop. 2006. Pattern Recognition and Machine Learning (Information Science and Statistics). Springer-Verlag, Berlin, Heidelberg.
+* [Christopher M. Bishop. 2006. Pattern Recognition and Machine Learning (Information Science and Statistics). Springer-Verlag, Berlin, Heidelberg.](https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf)
 * [MIT Lecture on KNN](https://youtu.be/09mb78oiPkA)
 
  

From d2930838252ef42f8de5bd7baf15814a7737ebb7 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Tue, 18 Oct 2022 10:47:53 -0700
Subject: [PATCH 04/31] remove KNN tutorial doc; comment out LinearSVC.fit solver body

---
 docs/tutorials/neighbors.md | 79 -------------------------------------
 torchml/svm/linear_svc.py   | 47 +++++++++++-----------
 2 files changed, 24 insertions(+), 102 deletions(-)
 delete mode 100644 docs/tutorials/neighbors.md

diff --git a/docs/tutorials/neighbors.md b/docs/tutorials/neighbors.md
deleted file mode 100644
index c4e3e88..0000000
--- a/docs/tutorials/neighbors.md
+++ /dev/null
@@ -1,79 +0,0 @@
-# Nearest Neighbors
-`torchml.neighbors` currently supports Unsupervised learnings on classification problem. It currently supports K Nearest Neighbors classification with `torchml.neighbors.NearestNeighbors` that implement `sklearn.neighbors.NearestNeighbors`'s brute force solution with TorchML.
-
-## K Nearest Neighbors classification
-The principle behind Nearest Neighbors algorithms is, given a distance function and a new test point $x$, the algorithm find k closest samples in the known sample set, and use them to estimate the $x$. The number $k$ can be user-defined and tuned according to the particular problem. The distance function can be any arbitrary metric function, and standard Euclidean distance is the most common choice.
-
-One important thing about this algorithm is that its not based on any probabilistic framework, but the algorithm is able to estimate probability for each class given a test point $x$ and its k neighbors.
-
-Given a dataset with $n$ samples and $b$ distinct classes, and a new point $x$ we wish to classify: 
-$\{x_i, y_i\}, i=1,2....n, y_i \in \{c_1, c_2, c_3... c_b\}$
-
-We calculate the number of samples that fall into a class for all classes:
-$\{n_a, a=1,2,3...b\}, \Sigma_{a=1}^{b}n_a = n$
-
-We first find the $k$ nearest neighbors of $x$:
-$\{x_j, y_j\}, i=1,2....k, y_j \in \{c_1, c_2, c_3... c_c\}$
-
-We then count the number of points in the $k$ neighbors that are in the class $c$:
-$\{nk_a, a=1,2,3...b\}, \Sigma_{a=1}^{b}nk_a = k$
-
-The probability that $x$ is of class $c_c$ is simply:
-$P(c_c | x)= {nk_c\over k}$
-
-This estimation is often accurate in practice, even though the algorithm is not built with probability in mind. 
-
-## KNN from a bayesian stand point
-Even though the KNN algorithm is not built on top of probabilistic framework, we can gain intuition behind its shockingly good estimation by framing it in the bayesian framework.
-
-What we want is:
-$P(c_c | x), c=1,2,3...b$
-and in bayesian terms, what we need is:
-$P(c_c | x) = {{P(x | c_c)*P(c_c)} \over {P(x)}}$
-Given nothing but our samples, $P(c_c)$, or the prior, is simply $n_c \over n$
-
-$P(x)$ is the probabilistic density of random variable $x$, and we need to borrow some knowledge from density estimation for this analysis:
-
-Since we don't know $P(x)$, we need to conduct discrete trials on $P(x)$. Suppose that the density $P(x)$ lies in a D-Dimensional space, and we assume it to be Euclidean. We conduct trials in this space by drawing $n$ points on it according to $P(x)$ (these $n$ points are our samples). By principle of locality, for a given point $x_t$ we've drawn on the space, we can assume that the density have some correlations with points in the small space surrounding it. Let's draw a small sphere around the point, and name the space in the sphere $R$.
-
-The total probability that a test point can end up inside $R$ is the sum of probability that a point can be in a point in $R$ over all the small points in $R$, or the probability mass of $P(x)$ in $R$:
-$P_{in R} = {\int_{R} P(x)dx}$
-
-For the $n$ samples we gathered, each sample has a probability $P_{in R}$ of being inside $R$, then the total number of $k$ points that successfully end up in $R$ can be modeled using binomial distribution:
-$Bin(k|n,P_{in R}) = {n! \over {k!(n-k)!}}{P_{in R}^k}{(1-P_{in R})}^{n-k}$
-
-We also have:
-$E(k) = n*P_{in R}$
-$P_{in R} = {{E(k)} \over n}$
-
-For our algorithm we supply the parameter $k$, so we can just sub in our well-chosen $k$ instead of the expectation, which gives us:
-$k \approx n*P_{in R}$
-$P_{in R} \approx {k \over n}$
-
-We further assume that $R$ is quite small, thus $P(x)$ changes very little inside $R$, and we assume $P(x)$ to follow a uniform distribution, then we can derive that:
-$P_{in R} \approx P(x)V$ Where $V$ is the volume of $R$.
-
-Then our final estimation of $P(x)$ will be:
-$P(x) = {{k}\over{nV}}$
-
-We repeat the process for a specific class $c_c$, and we will get:
-$P(x|c_c) = {{nk_c}\over{n_c V}}$
-
-substitute both $P(x|c_c) = {{nk_c}\over{n_c V}}$ and $P(x) = {{k}\over{nV}}$ into our bayesian, we will get:
-$P(c_c | x)= {nk_c\over k}$
-
-## Sources
-* [Christopher M. Bishop. 2006. Pattern Recognition and Machine Learning (Information Science and Statistics). Springer-Verlag, Berlin, Heidelberg.](https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf)
-* [MIT Lecture on KNN](https://youtu.be/09mb78oiPkA)
-
- 
-
-
-
-
-
-
-
-
-
-
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index e4f06f9..5a1b274 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -52,33 +52,34 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         self.y_ = y
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
 
-        m, n = X.shape
-
-        w = cp.Variable(n)
-        if self.fit_intercept:
-            b = cp.Variable()
-
-        obj = 0
-        for i in range(m):
-            if y[i] == self.classes_[1]:
-                yi = 1
-            else:
-                yi = -1
-            if self.fit_intercept:
-                obj += cp.square(cp.pos(1 - yi * (w.T @ X[i] + b)))
-            else:
-                obj += cp.sqaure(cp.pos(1 - yi * (w.T @ X[i])))
-
-        obj *= self.C
-        obj += cp.multiply((1 / 2.0), cp.norm(w, 2))
-
-        prob = cp.Problem(cp.Minimize(obj), [])
-        prob.solve()
-        self.coef_, self.intercept_ = torch.from_numpy(w.value), torch.from_numpy(b.value)
+        # m, n = X.shape
+        #
+        # w = cp.Variable(n)
+        # if self.fit_intercept:
+        #     b = cp.Variable()
+        #
+        # obj = 0
+        # for i in range(m):
+        #     if y[i] == self.classes_[1]:
+        #         yi = 1
+        #     else:
+        #         yi = -1
+        #     if self.fit_intercept:
+        #         obj += cp.square(cp.pos(1 - yi * (w.T @ X[i] + b)))
+        #     else:
+        #         obj += cp.sqaure(cp.pos(1 - yi * (w.T @ X[i])))
+        #
+        # obj *= self.C
+        # obj += cp.multiply((1 / 2.0), cp.norm(w, 2))
+        #
+        # prob = cp.Problem(cp.Minimize(obj), [])
+        # prob.solve()
+        # self.coef_, self.intercept_ = torch.from_numpy(w.value), torch.from_numpy(b.value)
         # if self.fit_intercept:
         #     fit_lr = CvxpyLayer(prob, [], [w, b])
         # else:
         #     fit_lr = CvxpyLayer(prob, [], [w])
         #
         # self.weight, self.intercept = fit_lr()
+
         return self

From dcd2a1a44abcc5be01959cfe4778706383904c8a Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Tue, 18 Oct 2022 12:10:54 -0700
Subject: [PATCH 05/31] vectorized LinearSVC

---
 tests/unit/svm/linear_svc_test.py |  4 +--
 torchml/svm/linear_svc.py         | 57 +++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index c1caf02..1e5a787 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -14,20 +14,18 @@
 
 class TestLinearSVC(unittest.TestCase):
     def test_coef(self):
-        x, y = make_classification(n_samples=500, n_features=10,
+        x, y = make_classification(n_samples=50000, n_features=10,
                                    n_classes=2)
         lsvc = LinearSVC(verbose=0)
         start = time.time()
         lsvc.fit(torch.from_numpy(x), torch.from_numpy(y))
         end = time.time()
         print(end - start)
-        print("Here")
         start = time.time()
         reflsvc = svm.LinearSVC()
         reflsvc.fit(x, y)
         end = time.time()
         print(end - start)
-        print("Here")
         self.assertTrue(np.allclose(lsvc.coef_.numpy(), reflsvc.coef_, atol=0.03))
 
 if __name__ == "__main__":
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index 5a1b274..21bd4e3 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -81,5 +81,62 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         #     fit_lr = CvxpyLayer(prob, [], [w])
         #
         # self.weight, self.intercept = fit_lr()
+        y = torch.unsqueeze(y, 1)
 
+        y = (y == self.classes_[1]).float()
+        y *= 2
+        y -= 1
+
+        m, n = X.shape
+
+        w = cp.Variable((n, 1))
+        if self.fit_intercept:
+            b = cp.Variable()
+        X_param = cp.Parameter((m, n))
+        y_param = cp.Parameter((m, 1))
+        C_param = cp.Parameter(nonneg=True)
+        ones = torch.ones((m, 1))
+
+        # set up objective
+        if self.fit_intercept:
+            loss = cp.multiply((1 / 2.0),
+                               cp.norm(w, 2)) + C_param * cp.sum(cp.square(cp.pos(ones -
+                                                                                   cp.multiply(y_param,
+                                                                                               X_param @ w + b))))
+        else:
+            loss = (1 / (2 * m)) * cp.sum(cp.square(X_param @ w - y_param))
+
+        objective = loss
+
+        # set up constraints
+        constraints = []
+
+        prob = cp.Problem(cp.Minimize(objective), constraints)
+        X_param.value = X.numpy()
+        y_param.value = y.numpy()
+        C_param.value = self.C
+        prob.solve(solver='ECOS', abstol=self.tol, max_iters=self.max_iter)
+
+        # convert into pytorch layer
+        # if self.fit_intercept:
+        #     fit_lr = CvxpyLayer(prob, [X_param, y_param, C_param], [w, b])
+        # else:
+        #     fit_lr = CvxpyLayer(prob, [X_param, y_param, C_param], [w])
+
+        # process input data
+        # if self.require_grad:
+        #     X.requires_grad_(True)
+        #     y.requires_grad_(True)
+
+        # this object is now callable with pytorch tensors
+
+        # if self.fit_intercept:
+        #     self.weight, self.intercept = fit_lr(
+        #         X, y, self.C
+        #     )
+        # else:
+        #     self.weight = fit_lr(X, y, torch.tensor(
+        #         self.alpha, dtype=torch.float64))
+        self.coef_, self.intercept_ = torch.from_numpy(w.value), torch.from_numpy(b.value)
+        self.coef_ = torch.t(self.coef_)
         return self

From 7117e9f24d4184cb31aa594fd5567298dfccf7cc Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Tue, 18 Oct 2022 12:20:15 -0700
Subject: [PATCH 06/31] support hinge loss

---
 torchml/svm/linear_svc.py | 55 ++++++++++++++-------------------------
 1 file changed, 19 insertions(+), 36 deletions(-)

diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index 21bd4e3..d897090 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -52,38 +52,9 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         self.y_ = y
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
 
-        # m, n = X.shape
-        #
-        # w = cp.Variable(n)
-        # if self.fit_intercept:
-        #     b = cp.Variable()
-        #
-        # obj = 0
-        # for i in range(m):
-        #     if y[i] == self.classes_[1]:
-        #         yi = 1
-        #     else:
-        #         yi = -1
-        #     if self.fit_intercept:
-        #         obj += cp.square(cp.pos(1 - yi * (w.T @ X[i] + b)))
-        #     else:
-        #         obj += cp.sqaure(cp.pos(1 - yi * (w.T @ X[i])))
-        #
-        # obj *= self.C
-        # obj += cp.multiply((1 / 2.0), cp.norm(w, 2))
-        #
-        # prob = cp.Problem(cp.Minimize(obj), [])
-        # prob.solve()
-        # self.coef_, self.intercept_ = torch.from_numpy(w.value), torch.from_numpy(b.value)
-        # if self.fit_intercept:
-        #     fit_lr = CvxpyLayer(prob, [], [w, b])
-        # else:
-        #     fit_lr = CvxpyLayer(prob, [], [w])
-        #
-        # self.weight, self.intercept = fit_lr()
         y = torch.unsqueeze(y, 1)
 
-        y = (y == self.classes_[1]).float()
+        y = (y != self.classes_[0]).float()
         y *= 2
         y -= 1
 
@@ -97,15 +68,27 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         C_param = cp.Parameter(nonneg=True)
         ones = torch.ones((m, 1))
 
+        loss = cp.multiply((1 / 2.0), cp.norm(w, 2))
+
         # set up objective
         if self.fit_intercept:
-            loss = cp.multiply((1 / 2.0),
-                               cp.norm(w, 2)) + C_param * cp.sum(cp.square(cp.pos(ones -
-                                                                                   cp.multiply(y_param,
-                                                                                               X_param @ w + b))))
+            if self.loss == "squared_hinge":
+                loss += C_param * cp.sum(cp.square(cp.pos(ones -
+                                                          cp.multiply(y_param,
+                                                                      X_param @ w + b))))
+            elif self.loss == "hinge":
+                loss += C_param * cp.sum(cp.pos(ones -
+                                                cp.multiply(y_param,
+                                                            X_param @ w + b)))
         else:
-            loss = (1 / (2 * m)) * cp.sum(cp.square(X_param @ w - y_param))
-
+            if self.loss == "squared_hinge":
+                loss += C_param * cp.sum(cp.square(cp.pos(ones -
+                                                          cp.multiply(y_param,
+                                                                      X_param @ w))))
+            elif self.loss == "hinge":
+                loss += C_param * cp.sum(cp.pos(ones -
+                                                cp.multiply(y_param,
+                                                            X_param @ w)))
         objective = loss
 
         # set up constraints

From c0c52839f3afd70b7dd9b7f761eaf8373a61a00f Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Tue, 18 Oct 2022 18:13:36 -0700
Subject: [PATCH 07/31] fix format; merge hinge-loss branches and test intercept

---
 tests/unit/svm/linear_svc_test.py     | 24 +++++--
 torchml/neighbors/nearest_centroid.py |  3 +-
 torchml/svm/linear_svc.py             | 96 ++++++++++-----------------
 3 files changed, 52 insertions(+), 71 deletions(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 1e5a787..d110bb7 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -8,25 +8,35 @@
 import torchml as ml
 from torchml.svm import LinearSVC
 
-BSZ = 128
-DIM = 5
+n_samples = 5000
+n_features = 10
+n_classes = 2
+n_informative = 8
 
 
 class TestLinearSVC(unittest.TestCase):
-    def test_coef(self):
-        x, y = make_classification(n_samples=50000, n_features=10,
-                                   n_classes=2)
-        lsvc = LinearSVC(verbose=0)
+    def test_simple(self):
+        x, y = make_classification(
+            n_samples=n_samples,
+            n_features=n_features,
+            n_classes=n_classes,
+            n_informative=n_informative,
+        )
+        lsvc = LinearSVC(max_iter=1000)
         start = time.time()
         lsvc.fit(torch.from_numpy(x), torch.from_numpy(y))
         end = time.time()
         print(end - start)
         start = time.time()
-        reflsvc = svm.LinearSVC()
+        reflsvc = svm.LinearSVC(max_iter=1000)
         reflsvc.fit(x, y)
         end = time.time()
         print(end - start)
         self.assertTrue(np.allclose(lsvc.coef_.numpy(), reflsvc.coef_, atol=0.03))
+        self.assertTrue(
+            np.allclose(lsvc.intercept_.numpy(), reflsvc.intercept_, atol=0.03)
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torchml/neighbors/nearest_centroid.py b/torchml/neighbors/nearest_centroid.py
index 96dbf7b..d6d7ae3 100644
--- a/torchml/neighbors/nearest_centroid.py
+++ b/torchml/neighbors/nearest_centroid.py
@@ -116,8 +116,7 @@ def predict(self, X: torch.tensor) -> torch.tensor:
 
         for i in range(X.size(dim=0)):
             ret[i] = self.classes_[
-                torch.argmin(torch.nn.PairwiseDistance(p=2)
-                             (X[i], self.centroids_))
+                torch.argmin(torch.nn.PairwiseDistance(p=2)(X[i], self.centroids_))
             ]
 
         # return ret.to(self.y_type)
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index d897090..43d9657 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -9,28 +9,26 @@
 
 
 class LinearSVC(ml.Model):
-
     def __init__(
-            self,
-            penalty="l2",
-            loss="squared_hinge",
-            *,
-            dual=True,
-            tol=1e-4,
-            C=1.0,
-            multi_class="ovr",
-            fit_intercept=True,
-            intercept_scaling=1,
-            class_weight=None,
-            verbose=0,
-            random_state=None,
-            max_iter=1000,
+        self,
+        penalty="l2",
+        loss="squared_hinge",
+        *,
+        dual=True,
+        tol=1e-4,
+        C=1.0,
+        multi_class="ovr",
+        fit_intercept=True,
+        intercept_scaling=1,
+        class_weight=None,
+        verbose=0,
+        random_state=None,
+        max_iter=1000,
     ):
         super(LinearSVC, self).__init__()
         self.coef_ = None
         self.intercept_ = None
         self.classes_ = None
-        self.y_ = None
         self.dual = dual
         self.tol = tol
         self.C = C
@@ -46,20 +44,21 @@ def __init__(
 
     def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         if self.C < 0:
-            raise ValueError(
-                "Penalty term must be positive; got (C=%r)" % self.C)
+            raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
         self.classes_ = torch.unique(y)
-        self.y_ = y
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
 
+        m, n = X.shape
+
+        self.coef_ = torch.empty((0, n))
+        self.intercept_ = torch.empty((0))
+
         y = torch.unsqueeze(y, 1)
 
         y = (y != self.classes_[0]).float()
         y *= 2
         y -= 1
 
-        m, n = X.shape
-
         w = cp.Variable((n, 1))
         if self.fit_intercept:
             b = cp.Variable()
@@ -70,25 +69,16 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
 
         loss = cp.multiply((1 / 2.0), cp.norm(w, 2))
 
-        # set up objective
         if self.fit_intercept:
-            if self.loss == "squared_hinge":
-                loss += C_param * cp.sum(cp.square(cp.pos(ones -
-                                                          cp.multiply(y_param,
-                                                                      X_param @ w + b))))
-            elif self.loss == "hinge":
-                loss += C_param * cp.sum(cp.pos(ones -
-                                                cp.multiply(y_param,
-                                                            X_param @ w + b)))
+            hinge = cp.pos(ones - cp.multiply(y_param, X_param @ w + b))
         else:
-            if self.loss == "squared_hinge":
-                loss += C_param * cp.sum(cp.square(cp.pos(ones -
-                                                          cp.multiply(y_param,
-                                                                      X_param @ w))))
-            elif self.loss == "hinge":
-                loss += C_param * cp.sum(cp.pos(ones -
-                                                cp.multiply(y_param,
-                                                            X_param @ w)))
+            hinge = cp.pos(ones - cp.multiply(y_param, X_param @ w))
+
+        if self.loss == "squared_hinge":
+            loss += C_param * cp.sum(cp.square(hinge))
+        elif self.loss == "hinge":
+            loss += C_param * cp.sum(hinge)
+
         objective = loss
 
         # set up constraints
@@ -98,28 +88,10 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         X_param.value = X.numpy()
         y_param.value = y.numpy()
         C_param.value = self.C
-        prob.solve(solver='ECOS', abstol=self.tol, max_iters=self.max_iter)
-
-        # convert into pytorch layer
-        # if self.fit_intercept:
-        #     fit_lr = CvxpyLayer(prob, [X_param, y_param, C_param], [w, b])
-        # else:
-        #     fit_lr = CvxpyLayer(prob, [X_param, y_param, C_param], [w])
-
-        # process input data
-        # if self.require_grad:
-        #     X.requires_grad_(True)
-        #     y.requires_grad_(True)
-
-        # this object is now callable with pytorch tensors
-
-        # if self.fit_intercept:
-        #     self.weight, self.intercept = fit_lr(
-        #         X, y, self.C
-        #     )
-        # else:
-        #     self.weight = fit_lr(X, y, torch.tensor(
-        #         self.alpha, dtype=torch.float64))
-        self.coef_, self.intercept_ = torch.from_numpy(w.value), torch.from_numpy(b.value)
-        self.coef_ = torch.t(self.coef_)
+        prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
+
+        self.coef_ = torch.cat((self.coef_, torch.t(torch.from_numpy(w.value))))
+        self.intercept_ = torch.cat(
+            (self.intercept_, torch.unsqueeze(torch.from_numpy(b.value), 0))
+        )
         return self

From 65bbe31ac748506ca7b6aee77b97b33b88aa93a1 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Fri, 21 Oct 2022 17:25:57 -0700
Subject: [PATCH 08/31] add support for multiclass

---
 tests/unit/svm/linear_svc_test.py | 19 +++++------
 torchml/svm/linear_svc.py         | 52 +++++++++++++++++--------------
 2 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index d110bb7..2a082df 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -5,36 +5,37 @@
 import sklearn.svm as svm
 import time
 
-import torchml as ml
 from torchml.svm import LinearSVC
 
 n_samples = 5000
 n_features = 10
-n_classes = 2
-n_informative = 8
+n_classes = 5
+n_informative = 10
 
 
 class TestLinearSVC(unittest.TestCase):
-    def test_simple(self):
+    def test_LinearSVC(self):
         x, y = make_classification(
             n_samples=n_samples,
             n_features=n_features,
             n_classes=n_classes,
             n_informative=n_informative,
+            n_redundant=n_features-n_informative
         )
         lsvc = LinearSVC(max_iter=1000)
         start = time.time()
         lsvc.fit(torch.from_numpy(x), torch.from_numpy(y))
         end = time.time()
-        print(end - start)
+        # print(end - start)
         start = time.time()
-        reflsvc = svm.LinearSVC(max_iter=1000)
+        reflsvc = svm.LinearSVC(max_iter=100000)
         reflsvc.fit(x, y)
         end = time.time()
-        print(end - start)
-        self.assertTrue(np.allclose(lsvc.coef_.numpy(), reflsvc.coef_, atol=0.03))
+        # print(end - start)
+        self.assertTrue(np.allclose(
+            lsvc.coef_.numpy(), reflsvc.coef_, atol=1e-2))
         self.assertTrue(
-            np.allclose(lsvc.intercept_.numpy(), reflsvc.intercept_, atol=0.03)
+            np.allclose(lsvc.intercept_.numpy(), reflsvc.intercept_, atol=1e-2)
         )
 
 
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index 43d9657..2da5511 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -1,29 +1,25 @@
 import torch
-from sklearn.datasets import make_classification
 
 import torchml as ml
 import cvxpy as cp
-from cvxpylayers.torch import CvxpyLayer
-
-from sklearn import svm
 
 
 class LinearSVC(ml.Model):
     def __init__(
-        self,
-        penalty="l2",
-        loss="squared_hinge",
-        *,
-        dual=True,
-        tol=1e-4,
-        C=1.0,
-        multi_class="ovr",
-        fit_intercept=True,
-        intercept_scaling=1,
-        class_weight=None,
-        verbose=0,
-        random_state=None,
-        max_iter=1000,
+            self,
+            penalty="l2",
+            loss="squared_hinge",
+            *,
+            dual=True,
+            tol=1e-4,
+            C=1.0,
+            multi_class="ovr",
+            fit_intercept=True,
+            intercept_scaling=1,
+            class_weight=None,
+            verbose=0,
+            random_state=None,
+            max_iter=1000,
     ):
         super(LinearSVC, self).__init__()
         self.coef_ = None
@@ -47,15 +43,22 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
             raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
         self.classes_ = torch.unique(y)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
-
         m, n = X.shape
-
         self.coef_ = torch.empty((0, n))
         self.intercept_ = torch.empty((0))
+        if self.classes_.shape[0] == 2:
+            self.fit_with_one_class_(X, y, self.classes_[1], sample_weight=sample_weight)
+        else:
+            for i, x in enumerate(self.classes_):
+                self.fit_with_one_class_(X, y, x, sample_weight=sample_weight)
+
+    def fit_with_one_class_(self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None):
+
+        m, n = X.shape
 
         y = torch.unsqueeze(y, 1)
 
-        y = (y != self.classes_[0]).float()
+        y = (y == fitting_class).float()
         y *= 2
         y -= 1
 
@@ -91,7 +94,8 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
 
         self.coef_ = torch.cat((self.coef_, torch.t(torch.from_numpy(w.value))))
-        self.intercept_ = torch.cat(
-            (self.intercept_, torch.unsqueeze(torch.from_numpy(b.value), 0))
-        )
+        if self.fit_intercept:
+            self.intercept_ = torch.cat(
+                (self.intercept_, torch.unsqueeze(torch.from_numpy(b.value), 0))
+            )
         return self

From 97872c467cda0e90b99cbe4a653453765366c780 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Tue, 25 Oct 2022 02:27:39 -0700
Subject: [PATCH 09/31] change n_informative

---
 tests/unit/svm/linear_svc_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 2a082df..7abed19 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -10,7 +10,7 @@
 n_samples = 5000
 n_features = 10
 n_classes = 5
-n_informative = 10
+n_informative = 7
 
 
 class TestLinearSVC(unittest.TestCase):

From e1ba7d2ff0bfb7088f8a1f14551e7882a332474c Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Tue, 25 Oct 2022 02:51:47 -0700
Subject: [PATCH 10/31] implemented predict and decision function

---
 tests/unit/svm/linear_svc_test.py |  9 +++++++-
 torchml/svm/linear_svc.py         | 37 ++++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 7abed19..2b4296f 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -20,7 +20,7 @@ def test_LinearSVC(self):
             n_features=n_features,
             n_classes=n_classes,
             n_informative=n_informative,
-            n_redundant=n_features-n_informative
+            n_redundant=n_features - n_informative
         )
         lsvc = LinearSVC(max_iter=1000)
         start = time.time()
@@ -30,6 +30,7 @@ def test_LinearSVC(self):
         start = time.time()
         reflsvc = svm.LinearSVC(max_iter=100000)
         reflsvc.fit(x, y)
+
         end = time.time()
         # print(end - start)
         self.assertTrue(np.allclose(
@@ -37,6 +38,12 @@ def test_LinearSVC(self):
         self.assertTrue(
             np.allclose(lsvc.intercept_.numpy(), reflsvc.intercept_, atol=1e-2)
         )
+        self.assertTrue(
+            np.allclose(lsvc.decision_function(torch.from_numpy(x)), reflsvc.decision_function(x), atol=1e-2)
+        )
+        self.assertTrue(
+            np.allclose(lsvc.predict(torch.from_numpy(x)), reflsvc.predict(x), atol=1e-2)
+        )
 
 
 if __name__ == "__main__":
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index 2da5511..a7c6ebd 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -40,19 +40,45 @@ def __init__(
 
     def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         if self.C < 0:
-            raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
+            raise ValueError(
+                "Penalty term must be positive; got (C=%r)" % self.C)
         self.classes_ = torch.unique(y)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
         m, n = X.shape
         self.coef_ = torch.empty((0, n))
         self.intercept_ = torch.empty((0))
         if self.classes_.shape[0] == 2:
-            self.fit_with_one_class_(X, y, self.classes_[1], sample_weight=sample_weight)
+            self._fit_with_one_class(
+                X, y, self.classes_[1], sample_weight=sample_weight)
         else:
             for i, x in enumerate(self.classes_):
-                self.fit_with_one_class_(X, y, x, sample_weight=sample_weight)
+                self._fit_with_one_class(X, y, x, sample_weight=sample_weight)
+
+    def decision_function(self, X : torch.Tensor) -> torch.Tensor:
+        return X @ self.coef_.T + self.intercept_
+
+    def predict(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        Predict class labels for samples in X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data matrix for which we want to get the predictions.
+
+        Returns
+        -------
+        y_pred : ndarray of shape (n_samples,)
+            Vector containing the class labels for each sample.
+        """
+        scores = self.decision_function(X)
+        if len(scores.shape) == 1:
+            indices = (scores > 0).int()
+        else:
+            indices = scores.argmax(dim=1)
+        return self.classes_[indices]
 
-    def fit_with_one_class_(self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None):
+    def _fit_with_one_class(self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None):
 
         m, n = X.shape
 
@@ -93,7 +119,8 @@ def fit_with_one_class_(self, X: torch.Tensor, y: torch.Tensor, fitting_class: a
         C_param.value = self.C
         prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
 
-        self.coef_ = torch.cat((self.coef_, torch.t(torch.from_numpy(w.value))))
+        self.coef_ = torch.cat(
+            (self.coef_, torch.t(torch.from_numpy(w.value))))
         if self.fit_intercept:
             self.intercept_ = torch.cat(
                 (self.intercept_, torch.unsqueeze(torch.from_numpy(b.value), 0))

From 8193a94f4a08c1b02c20057e95ca8968f68ed501 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Thu, 27 Oct 2022 22:09:52 -0700
Subject: [PATCH 11/31] add LinearSVR skeleton

---
 torchml/svm/__init__.py   |   1 +
 torchml/svm/linear_svr.py | 128 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100644 torchml/svm/linear_svr.py

diff --git a/torchml/svm/__init__.py b/torchml/svm/__init__.py
index c799ee5..09ebc60 100644
--- a/torchml/svm/__init__.py
+++ b/torchml/svm/__init__.py
@@ -1 +1,2 @@
 from .linear_svc import LinearSVC
+from .linear_svr import LinearSVR
diff --git a/torchml/svm/linear_svr.py b/torchml/svm/linear_svr.py
new file mode 100644
index 0000000..3c7cc07
--- /dev/null
+++ b/torchml/svm/linear_svr.py
@@ -0,0 +1,128 @@
+import torch
+
+import torchml as ml
+import cvxpy as cp
+
+
+class LinearSVR(ml.Model):
+    def __init__(
+            self,
+            penalty="l2",
+            loss="squared_hinge",
+            *,
+            dual=True,
+            tol=1e-4,
+            C=1.0,
+            multi_class="ovr",
+            fit_intercept=True,
+            intercept_scaling=1,
+            class_weight=None,
+            verbose=0,
+            random_state=None,
+            max_iter=1000,
+    ):
+        super(LinearSVC, self).__init__()
+        self.coef_ = None
+        self.intercept_ = None
+        self.classes_ = None
+        self.dual = dual
+        self.tol = tol
+        self.C = C
+        self.multi_class = multi_class
+        self.fit_intercept = fit_intercept
+        self.intercept_scaling = intercept_scaling
+        self.class_weight = class_weight
+        self.verbose = verbose
+        self.random_state = random_state
+        self.max_iter = max_iter
+        self.penalty = penalty
+        self.loss = loss
+
+    def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
+        if self.C < 0:
+            raise ValueError(
+                "Penalty term must be positive; got (C=%r)" % self.C)
+        self.classes_ = torch.unique(y)
+        assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
+        m, n = X.shape
+        self.coef_ = torch.empty((0, n))
+        self.intercept_ = torch.empty((0))
+        if self.classes_.shape[0] == 2:
+            self._fit_with_one_class(
+                X, y, self.classes_[1], sample_weight=sample_weight)
+        else:
+            for i, x in enumerate(self.classes_):
+                self._fit_with_one_class(X, y, x, sample_weight=sample_weight)
+
+    def decision_function(self, X: torch.Tensor) -> torch.Tensor:
+        return X @ self.coef_.T + self.intercept_
+
+    def predict(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        Predict class labels for samples in X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data matrix for which we want to get the predictions.
+
+        Returns
+        -------
+        y_pred : ndarray of shape (n_samples,)
+            Vector containing the class labels for each sample.
+        """
+        scores = self.decision_function(X)
+        if len(scores.shape) == 1:
+            indices = (scores > 0).int()
+        else:
+            indices = scores.argmax(dim=1)
+        return self.classes_[indices]
+
+    def _fit_with_one_class(self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None):
+
+        m, n = X.shape
+
+        y = torch.unsqueeze(y, 1)
+
+        y = (y == fitting_class).float()
+        y *= 2
+        y -= 1
+
+        w = cp.Variable((n, 1))
+        if self.fit_intercept:
+            b = cp.Variable()
+        X_param = cp.Parameter((m, n))
+        y_param = cp.Parameter((m, 1))
+        C_param = cp.Parameter(nonneg=True)
+        ones = torch.ones((m, 1))
+
+        loss = cp.multiply((1 / 2.0), cp.norm(w, 2))
+
+        if self.fit_intercept:
+            hinge = cp.pos(ones - cp.multiply(y_param, X_param @ w + b))
+        else:
+            hinge = cp.pos(ones - cp.multiply(y_param, X_param @ w))
+
+        if self.loss == "squared_hinge":
+            loss += C_param * cp.sum(cp.square(hinge))
+        elif self.loss == "hinge":
+            loss += C_param * cp.sum(hinge)
+
+        objective = loss
+
+        # set up constraints
+        constraints = []
+
+        prob = cp.Problem(cp.Minimize(objective), constraints)
+        X_param.value = X.numpy()
+        y_param.value = y.numpy()
+        C_param.value = self.C
+        prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
+
+        self.coef_ = torch.cat(
+            (self.coef_, torch.t(torch.from_numpy(w.value))))
+        if self.fit_intercept:
+            self.intercept_ = torch.cat(
+                (self.intercept_, torch.unsqueeze(torch.from_numpy(b.value), 0))
+            )
+        return self

From cbd5d01ebe4fadfbf7704a97c7039797a162e44d Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Thu, 27 Oct 2022 22:51:26 -0700
Subject: [PATCH 12/31] add empty linear_svr_test.py placeholder

---
 tests/unit/svm/linear_svr_test.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/unit/svm/linear_svr_test.py

diff --git a/tests/unit/svm/linear_svr_test.py b/tests/unit/svm/linear_svr_test.py
new file mode 100644
index 0000000..e69de29

From 9eefcdb1ce29788728ac5252ee19cf00130cad86 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Thu, 27 Oct 2022 22:51:31 -0700
Subject: [PATCH 13/31] implement LinearSVR and add its unit test

---
 tests/unit/svm/linear_svc_test.py |  15 +++--
 tests/unit/svm/linear_svr_test.py |  45 +++++++++++++
 torchml/svm/linear_svc.py         |  43 +++++++------
 torchml/svm/linear_svr.py         | 102 +++++++++---------------------
 4 files changed, 106 insertions(+), 99 deletions(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 2b4296f..2f1826f 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -20,7 +20,7 @@ def test_LinearSVC(self):
             n_features=n_features,
             n_classes=n_classes,
             n_informative=n_informative,
-            n_redundant=n_features - n_informative
+            n_redundant=n_features - n_informative,
         )
         lsvc = LinearSVC(max_iter=1000)
         start = time.time()
@@ -33,16 +33,21 @@ def test_LinearSVC(self):
 
         end = time.time()
         # print(end - start)
-        self.assertTrue(np.allclose(
-            lsvc.coef_.numpy(), reflsvc.coef_, atol=1e-2))
+        self.assertTrue(np.allclose(lsvc.coef_.numpy(), reflsvc.coef_, atol=1e-2))
         self.assertTrue(
             np.allclose(lsvc.intercept_.numpy(), reflsvc.intercept_, atol=1e-2)
         )
         self.assertTrue(
-            np.allclose(lsvc.decision_function(torch.from_numpy(x)), reflsvc.decision_function(x), atol=1e-2)
+            np.allclose(
+                lsvc.decision_function(torch.from_numpy(x)),
+                reflsvc.decision_function(x),
+                atol=1e-2,
+            )
         )
         self.assertTrue(
-            np.allclose(lsvc.predict(torch.from_numpy(x)), reflsvc.predict(x), atol=1e-2)
+            np.allclose(
+                lsvc.predict(torch.from_numpy(x)), reflsvc.predict(x), atol=1e-2
+            )
         )
 
 
diff --git a/tests/unit/svm/linear_svr_test.py b/tests/unit/svm/linear_svr_test.py
index e69de29..c5c7a69 100644
--- a/tests/unit/svm/linear_svr_test.py
+++ b/tests/unit/svm/linear_svr_test.py
@@ -0,0 +1,45 @@
+import unittest
+import numpy as np
+import torch
+from sklearn.datasets import make_regression
+import sklearn.svm as svm
+import time
+
+from torchml.svm import LinearSVR
+
+n_samples = 5000
+n_features = 10
+n_informative = 7
+
+
+class TestLinearSVR(unittest.TestCase):
+    def test_LinearSVR(self):
+        x, y = make_regression(
+            n_samples=n_samples,
+            n_features=n_features,
+            n_informative=n_informative,
+        )
+        lsvr = LinearSVR(max_iter=1000)
+        start = time.time()
+        lsvr.fit(torch.from_numpy(x), torch.from_numpy(y))
+        end = time.time()
+        print(end - start)
+        start = time.time()
+        reflsvr = svm.LinearSVR(max_iter=100000)
+        reflsvr.fit(x, y)
+
+        end = time.time()
+        print(end - start)
+        self.assertTrue(np.allclose(lsvr.coef_.numpy(), reflsvr.coef_, atol=1e-2))
+        self.assertTrue(
+            np.allclose(lsvr.intercept_.numpy(), reflsvr.intercept_, atol=1e-2)
+        )
+        self.assertTrue(
+            np.allclose(
+                lsvr.predict(torch.from_numpy(x)), reflsvr.predict(x), atol=1e-2
+            )
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index a7c6ebd..79f583d 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -6,20 +6,20 @@
 
 class LinearSVC(ml.Model):
     def __init__(
-            self,
-            penalty="l2",
-            loss="squared_hinge",
-            *,
-            dual=True,
-            tol=1e-4,
-            C=1.0,
-            multi_class="ovr",
-            fit_intercept=True,
-            intercept_scaling=1,
-            class_weight=None,
-            verbose=0,
-            random_state=None,
-            max_iter=1000,
+        self,
+        penalty="l2",
+        loss="squared_hinge",
+        *,
+        dual=True,
+        tol=1e-4,
+        C=1.0,
+        multi_class="ovr",
+        fit_intercept=True,
+        intercept_scaling=1,
+        class_weight=None,
+        verbose=0,
+        random_state=None,
+        max_iter=1000,
     ):
         super(LinearSVC, self).__init__()
         self.coef_ = None
@@ -40,8 +40,7 @@ def __init__(
 
     def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         if self.C < 0:
-            raise ValueError(
-                "Penalty term must be positive; got (C=%r)" % self.C)
+            raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
         self.classes_ = torch.unique(y)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
         m, n = X.shape
@@ -49,12 +48,13 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         self.intercept_ = torch.empty((0))
         if self.classes_.shape[0] == 2:
             self._fit_with_one_class(
-                X, y, self.classes_[1], sample_weight=sample_weight)
+                X, y, self.classes_[1], sample_weight=sample_weight
+            )
         else:
             for i, x in enumerate(self.classes_):
                 self._fit_with_one_class(X, y, x, sample_weight=sample_weight)
 
-    def decision_function(self, X : torch.Tensor) -> torch.Tensor:
+    def decision_function(self, X: torch.Tensor) -> torch.Tensor:
         return X @ self.coef_.T + self.intercept_
 
     def predict(self, X: torch.Tensor) -> torch.Tensor:
@@ -78,7 +78,9 @@ def predict(self, X: torch.Tensor) -> torch.Tensor:
             indices = scores.argmax(dim=1)
         return self.classes_[indices]
 
-    def _fit_with_one_class(self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None):
+    def _fit_with_one_class(
+        self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None
+    ):
 
         m, n = X.shape
 
@@ -119,8 +121,7 @@ def _fit_with_one_class(self, X: torch.Tensor, y: torch.Tensor, fitting_class: a
         C_param.value = self.C
         prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
 
-        self.coef_ = torch.cat(
-            (self.coef_, torch.t(torch.from_numpy(w.value))))
+        self.coef_ = torch.cat((self.coef_, torch.t(torch.from_numpy(w.value))))
         if self.fit_intercept:
             self.intercept_ = torch.cat(
                 (self.intercept_, torch.unsqueeze(torch.from_numpy(b.value), 0))
diff --git a/torchml/svm/linear_svr.py b/torchml/svm/linear_svr.py
index 3c7cc07..454c3c6 100644
--- a/torchml/svm/linear_svr.py
+++ b/torchml/svm/linear_svr.py
@@ -6,106 +6,61 @@
 
 class LinearSVR(ml.Model):
     def __init__(
-            self,
-            penalty="l2",
-            loss="squared_hinge",
-            *,
-            dual=True,
-            tol=1e-4,
-            C=1.0,
-            multi_class="ovr",
-            fit_intercept=True,
-            intercept_scaling=1,
-            class_weight=None,
-            verbose=0,
-            random_state=None,
-            max_iter=1000,
+        self,
+        *,
+        epsilon=0.0,
+        tol=1e-4,
+        C=1.0,
+        loss="epsilon_insensitive",
+        fit_intercept=True,
+        intercept_scaling=1.0,
+        dual=True,
+        verbose=0,
+        random_state=None,
+        max_iter=1000,
     ):
-        super(LinearSVC, self).__init__()
-        self.coef_ = None
+        super(LinearSVR, self).__init__()
         self.intercept_ = None
+        self.coef_ = None
         self.classes_ = None
-        self.dual = dual
         self.tol = tol
         self.C = C
-        self.multi_class = multi_class
+        self.epsilon = epsilon
         self.fit_intercept = fit_intercept
         self.intercept_scaling = intercept_scaling
-        self.class_weight = class_weight
         self.verbose = verbose
         self.random_state = random_state
         self.max_iter = max_iter
-        self.penalty = penalty
+        self.dual = dual
         self.loss = loss
 
     def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         if self.C < 0:
-            raise ValueError(
-                "Penalty term must be positive; got (C=%r)" % self.C)
-        self.classes_ = torch.unique(y)
+            raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
         m, n = X.shape
-        self.coef_ = torch.empty((0, n))
-        self.intercept_ = torch.empty((0))
-        if self.classes_.shape[0] == 2:
-            self._fit_with_one_class(
-                X, y, self.classes_[1], sample_weight=sample_weight)
-        else:
-            for i, x in enumerate(self.classes_):
-                self._fit_with_one_class(X, y, x, sample_weight=sample_weight)
-
-    def decision_function(self, X: torch.Tensor) -> torch.Tensor:
-        return X @ self.coef_.T + self.intercept_
-
-    def predict(self, X: torch.Tensor) -> torch.Tensor:
-        """
-        Predict class labels for samples in X.
-
-        Parameters
-        ----------
-        X : {array-like, sparse matrix} of shape (n_samples, n_features)
-            The data matrix for which we want to get the predictions.
-
-        Returns
-        -------
-        y_pred : ndarray of shape (n_samples,)
-            Vector containing the class labels for each sample.
-        """
-        scores = self.decision_function(X)
-        if len(scores.shape) == 1:
-            indices = (scores > 0).int()
-        else:
-            indices = scores.argmax(dim=1)
-        return self.classes_[indices]
-
-    def _fit_with_one_class(self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None):
-
         m, n = X.shape
 
         y = torch.unsqueeze(y, 1)
 
-        y = (y == fitting_class).float()
-        y *= 2
-        y -= 1
-
         w = cp.Variable((n, 1))
         if self.fit_intercept:
             b = cp.Variable()
         X_param = cp.Parameter((m, n))
         y_param = cp.Parameter((m, 1))
         C_param = cp.Parameter(nonneg=True)
-        ones = torch.ones((m, 1))
+        epi_param = cp.Parameter()
 
         loss = cp.multiply((1 / 2.0), cp.norm(w, 2))
 
         if self.fit_intercept:
-            hinge = cp.pos(ones - cp.multiply(y_param, X_param @ w + b))
+            hinge = cp.pos(cp.abs(y_param - (X_param @ w + b)) - epi_param)
         else:
-            hinge = cp.pos(ones - cp.multiply(y_param, X_param @ w))
+            hinge = cp.pos(cp.abs(y_param - (X_param @ w + b)) - epi_param)
 
-        if self.loss == "squared_hinge":
+        if self.loss == "epsilon_insensitive":
             loss += C_param * cp.sum(cp.square(hinge))
-        elif self.loss == "hinge":
+        elif self.loss == "squared_epsilon_insensitive":
             loss += C_param * cp.sum(hinge)
 
         objective = loss
@@ -117,12 +72,13 @@ def _fit_with_one_class(self, X: torch.Tensor, y: torch.Tensor, fitting_class: a
         X_param.value = X.numpy()
         y_param.value = y.numpy()
         C_param.value = self.C
+        epi_param.value = self.epsilon
         prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
 
-        self.coef_ = torch.cat(
-            (self.coef_, torch.t(torch.from_numpy(w.value))))
-        if self.fit_intercept:
-            self.intercept_ = torch.cat(
-                (self.intercept_, torch.unsqueeze(torch.from_numpy(b.value), 0))
-            )
+        self.coef_, self.intercept_ = torch.flatten(
+            torch.from_numpy(w.value)
+        ), torch.flatten(torch.from_numpy(b.value))
         return self
+
+    def predict(self, X: torch.Tensor) -> torch.Tensor:
+        return X @ self.coef_ + self.intercept_

From 4abcbc9c075e9c216828a781e198712b299bdd8c Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Thu, 27 Oct 2022 22:53:07 -0700
Subject: [PATCH 14/31] format the repo with black

---
 tests/unit/svm/linear_svc_test.py | 15 +++++++----
 torchml/svm/linear_svc.py         | 43 ++++++++++++++++---------------
 2 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 2b4296f..2f1826f 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -20,7 +20,7 @@ def test_LinearSVC(self):
             n_features=n_features,
             n_classes=n_classes,
             n_informative=n_informative,
-            n_redundant=n_features - n_informative
+            n_redundant=n_features - n_informative,
         )
         lsvc = LinearSVC(max_iter=1000)
         start = time.time()
@@ -33,16 +33,21 @@ def test_LinearSVC(self):
 
         end = time.time()
         # print(end - start)
-        self.assertTrue(np.allclose(
-            lsvc.coef_.numpy(), reflsvc.coef_, atol=1e-2))
+        self.assertTrue(np.allclose(lsvc.coef_.numpy(), reflsvc.coef_, atol=1e-2))
         self.assertTrue(
             np.allclose(lsvc.intercept_.numpy(), reflsvc.intercept_, atol=1e-2)
         )
         self.assertTrue(
-            np.allclose(lsvc.decision_function(torch.from_numpy(x)), reflsvc.decision_function(x), atol=1e-2)
+            np.allclose(
+                lsvc.decision_function(torch.from_numpy(x)),
+                reflsvc.decision_function(x),
+                atol=1e-2,
+            )
         )
         self.assertTrue(
-            np.allclose(lsvc.predict(torch.from_numpy(x)), reflsvc.predict(x), atol=1e-2)
+            np.allclose(
+                lsvc.predict(torch.from_numpy(x)), reflsvc.predict(x), atol=1e-2
+            )
         )
 
 
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index a7c6ebd..79f583d 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -6,20 +6,20 @@
 
 class LinearSVC(ml.Model):
     def __init__(
-            self,
-            penalty="l2",
-            loss="squared_hinge",
-            *,
-            dual=True,
-            tol=1e-4,
-            C=1.0,
-            multi_class="ovr",
-            fit_intercept=True,
-            intercept_scaling=1,
-            class_weight=None,
-            verbose=0,
-            random_state=None,
-            max_iter=1000,
+        self,
+        penalty="l2",
+        loss="squared_hinge",
+        *,
+        dual=True,
+        tol=1e-4,
+        C=1.0,
+        multi_class="ovr",
+        fit_intercept=True,
+        intercept_scaling=1,
+        class_weight=None,
+        verbose=0,
+        random_state=None,
+        max_iter=1000,
     ):
         super(LinearSVC, self).__init__()
         self.coef_ = None
@@ -40,8 +40,7 @@ def __init__(
 
     def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         if self.C < 0:
-            raise ValueError(
-                "Penalty term must be positive; got (C=%r)" % self.C)
+            raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
         self.classes_ = torch.unique(y)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
         m, n = X.shape
@@ -49,12 +48,13 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         self.intercept_ = torch.empty((0))
         if self.classes_.shape[0] == 2:
             self._fit_with_one_class(
-                X, y, self.classes_[1], sample_weight=sample_weight)
+                X, y, self.classes_[1], sample_weight=sample_weight
+            )
         else:
             for i, x in enumerate(self.classes_):
                 self._fit_with_one_class(X, y, x, sample_weight=sample_weight)
 
-    def decision_function(self, X : torch.Tensor) -> torch.Tensor:
+    def decision_function(self, X: torch.Tensor) -> torch.Tensor:
         return X @ self.coef_.T + self.intercept_
 
     def predict(self, X: torch.Tensor) -> torch.Tensor:
@@ -78,7 +78,9 @@ def predict(self, X: torch.Tensor) -> torch.Tensor:
             indices = scores.argmax(dim=1)
         return self.classes_[indices]
 
-    def _fit_with_one_class(self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None):
+    def _fit_with_one_class(
+        self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None
+    ):
 
         m, n = X.shape
 
@@ -119,8 +121,7 @@ def _fit_with_one_class(self, X: torch.Tensor, y: torch.Tensor, fitting_class: a
         C_param.value = self.C
         prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
 
-        self.coef_ = torch.cat(
-            (self.coef_, torch.t(torch.from_numpy(w.value))))
+        self.coef_ = torch.cat((self.coef_, torch.t(torch.from_numpy(w.value))))
         if self.fit_intercept:
             self.intercept_ = torch.cat(
                 (self.intercept_, torch.unsqueeze(torch.from_numpy(b.value), 0))

From 7638bed593e5d75777a4136508f2c50d3c9aa156 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Sun, 30 Oct 2022 18:52:42 -0700
Subject: [PATCH 15/31] add docs for linearSVC

---
 torchml/svm/linear_svc.py | 104 ++++++++++++++++++++++++++++++++++----
 1 file changed, 95 insertions(+), 9 deletions(-)

diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index 79f583d..8d38b66 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -5,6 +5,77 @@
 
 
 class LinearSVC(ml.Model):
+    """
+    ## Description
+
+    Unsupervised learner for implementing KNN Classifier.
+
+    ## References
+
+    1. Bernhard E. Boser, Isabelle M. Guyon, and Vladimir N. Vapnik. 1992. A training algorithm for optimal margin classifiers. In Proceedings of the fifth annual workshop on Computational learning theory (COLT '92). Association for Computing Machinery, New York, NY, USA, 144–152. https://doi.org/10.1145/130385.130401
+    2. MIT 6.034 Artificial Intelligence, Fall 2010, [16. Learning: Support Vector Machines](https://youtu.be/_PwhiWxHK8o)
+    3. The scikit-learn [documentation page](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html) for LinearSVC.
+
+    ## Arguments
+
+    * `penalty` (str {'l1', 'l2'}, default=’l2’):
+        Specifies the norm used in the penalization.
+
+    * `loss` (str {‘hinge’, ‘squared_hinge’}, default=’squared_hinge’):
+        Specifies the loss function. ‘hinge’ is the standard SVM loss.
+
+    * `dual` (bool, default=True):
+        Dummy variable to keep consistency with SKlearn's API, always 'False' for now.
+
+    * `tol` (float, default=1e-4):
+        Tolerance for stopping criteria.
+
+    * `C` (float, default=1.0):
+        Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive.
+
+    * `multi_class` (str {‘ovr’, ‘crammer_singer’}, default=’ovr’):
+        Dummy variable, always 'ovr' (one-vs-rest: each class is fit against all the other classes combined).
+
+    * `fit_intercept` (bool, default=True):
+        Whether to calculate the intercept for this model.
+
+    * `intercept_scaling` (float, default=1):
+        Dummy variable to mimic the sklearn API, always 1 for now
+
+    * `class_weight` (dict or str ‘balanced’, default=None):
+        Dummy variable to mimic the sklearn API, always None for now
+
+    * `verbose` (int, default=0):
+        Dummy variable to mimic the sklearn API, always 0 for now
+
+    * `random_state` (int, RandomState instance or None, default=None):
+        Dummy variable to mimic the sklearn API, always None for now
+
+    * `max_iter` (int, default=1000):
+        The maximum number of iterations for the underlying convex solver.
+
+
+    ## Example
+
+    ~~~python
+    import torch
+    from torchml.svm import LinearSVC
+    from sklearn.datasets import make_classification
+
+    x, y = make_classification(
+            n_samples=n_samples,
+            n_features=n_features,
+            n_classes=n_classes,
+            n_informative=n_informative,
+            n_redundant=n_features - n_informative,
+        )
+    svc = LinearSVC(max_iter=1000)
+    svc.fit(torch.from_numpy(x), torch.from_numpy(y))
+    svc.decision_function(torch.from_numpy(x))
+    svc.predict(torch.from_numpy(x))
+    ~~~
+    """
+
     def __init__(
         self,
         penalty="l2",
@@ -39,6 +110,16 @@ def __init__(
         self.loss = loss
 
     def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
+        """
+        ## Description
+
+        Fit the classifier to the training data.
+
+        ## Arguments
+        * `X` (torch.Tensor): the training set
+        * `y` (torch.Tensor): the class labels for each sample
+
+        """
         if self.C < 0:
             raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
         self.classes_ = torch.unique(y)
@@ -55,21 +136,26 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
                 self._fit_with_one_class(X, y, x, sample_weight=sample_weight)
 
     def decision_function(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        ## Description
+
+        Predict confidence scores for samples.
+
+        ## Arguments
+        * `X` (torch.Tensor): the data set for which we want to get the confidence scores.
+
+        """
         return X @ self.coef_.T + self.intercept_
 
     def predict(self, X: torch.Tensor) -> torch.Tensor:
         """
-        Predict class labels for samples in X.
+        ## Description
+
+        Predict the class labels for the provided data.
 
-        Parameters
-        ----------
-        X : {array-like, sparse matrix} of shape (n_samples, n_features)
-            The data matrix for which we want to get the predictions.
+        ## Arguments
 
-        Returns
-        -------
-        y_pred : ndarray of shape (n_samples,)
-            Vector containing the class labels for each sample.
+        * `X` (torch.Tensor): the samples to classify
         """
         scores = self.decision_function(X)
         if len(scores.shape) == 1:

From 9610f737a7a427e5946a7707ce614baea729a11e Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Sun, 30 Oct 2022 18:55:24 -0700
Subject: [PATCH 16/31] fix doc

---
 torchml/svm/linear_svc.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index 8d38b66..e491842 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -8,7 +8,7 @@ class LinearSVC(ml.Model):
     """
     ## Description
 
-    Unsupervised learner for implementing KNN Classifier.
+    Support vector classifier with cvxpy
 
     ## References
 
@@ -121,7 +121,8 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
 
         """
         if self.C < 0:
-            raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
+            raise ValueError(
+                "Penalty term must be positive; got (C=%r)" % self.C)
         self.classes_ = torch.unique(y)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
         m, n = X.shape
@@ -207,7 +208,8 @@ def _fit_with_one_class(
         C_param.value = self.C
         prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
 
-        self.coef_ = torch.cat((self.coef_, torch.t(torch.from_numpy(w.value))))
+        self.coef_ = torch.cat(
+            (self.coef_, torch.t(torch.from_numpy(w.value))))
         if self.fit_intercept:
             self.intercept_ = torch.cat(
                 (self.intercept_, torch.unsqueeze(torch.from_numpy(b.value), 0))

From 63cdd9a87beb70ecc1f2ddadc37aaa755b974f36 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Sun, 30 Oct 2022 19:08:17 -0700
Subject: [PATCH 17/31] add docs for linearSVR

---
 torchml/svm/linear_svr.py | 82 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/torchml/svm/linear_svr.py b/torchml/svm/linear_svr.py
index 454c3c6..301a236 100644
--- a/torchml/svm/linear_svr.py
+++ b/torchml/svm/linear_svr.py
@@ -5,6 +5,68 @@
 
 
 class LinearSVR(ml.Model):
+    """
+    ## Description
+
+    Support vector regressor with cvxpy
+
+    ## References
+
+    1. Bernhard E. Boser, Isabelle M. Guyon, and Vladimir N. Vapnik. 1992. A training algorithm for optimal margin classifiers. In Proceedings of the fifth annual workshop on Computational learning theory (COLT '92). Association for Computing Machinery, New York, NY, USA, 144–152. https://doi.org/10.1145/130385.130401
+    2. MIT 6.034 Artificial Intelligence, Fall 2010, [16. Learning: Support Vector Machines](https://youtu.be/_PwhiWxHK8o)
+    3. The scikit-learn [documentation page](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html) for LinearSVR.
+
+    ## Arguments
+
+    * `loss` (str {‘epsilon_insensitive’, ‘squared_epsilon_insensitive’}, default=’epsilon_insensitive’):
+        Specifies the loss function.
+
+    * `epsilon` (float, default=0.0):
+        Epsilon parameter in the epsilon-insensitive loss function.
+
+    * `tol` (float, default=1e-4):
+        Tolerance for stopping criteria.
+
+    * `C` (float, default=1.0):
+        Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive.
+
+    * `fit_intercept` (bool, default=True):
+        Whether to calculate the intercept for this model.
+
+    * `intercept_scaling` (float, default=1):
+        Dummy variable to mimic the sklearn API, always 1 for now
+
+    * `dual` (bool, default=True):
+        Dummy variable to keep consistency with SKlearn's API, always 'False' for now.
+
+    * `verbose` (int, default=0):
+        Dummy variable to mimic the sklearn API, always 0 for now
+
+    * `random_state` (int, RandomState instance or None, default=None):
+        Dummy variable to mimic the sklearn API, always None for now
+
+    * `max_iter` (int, default=1000):
+        The maximum number of iterations for the underlying convex solver.
+
+
+    ## Example
+
+    ~~~python
+    import torch
+    from torchml.svm import LinearSVR
+    from sklearn.datasets import make_regression
+
+    x, y = make_regression(
+        n_samples=n_samples,
+        n_features=n_features,
+        n_informative=n_informative,
+    )
+    svr = LinearSVR(max_iter=1000)
+    svr.fit(torch.from_numpy(x), torch.from_numpy(y))
+    svr.predict(torch.from_numpy(x))
+    ~~~
+    """
+
     def __init__(
         self,
         *,
@@ -35,6 +97,17 @@ def __init__(
         self.loss = loss
 
     def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
+        """
+        ## Description
+
+        Fit the regressor to the training data.
+
+        ## Arguments
+        * `X` (torch.Tensor): the training set
+        * `y` (torch.Tensor): Target vector relative to X.
+        * `sample_weight` (default=None): Dummy variable for feature not supported yet.
+        """
+
         if self.C < 0:
             raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
@@ -81,4 +154,13 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         return self
 
     def predict(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        ## Description
+
+        Predict using the linear model
+
+        ## Arguments
+
+        * `X` (torch.Tensor): Samples.
+        """
         return X @ self.coef_ + self.intercept_

From 943224aaf25eb658453103b6a38e0a7c3ac82485 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Fri, 4 Nov 2022 20:53:29 -0700
Subject: [PATCH 18/31] add dpp formulation

---
 tests/unit/svm/linear_svc_test.py |  4 ++--
 torchml/svm/linear_svc.py         | 34 ++++++++++++++++++++-----------
 torchml/svm/linear_svr.py         |  3 ++-
 3 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 2f1826f..9172cc4 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -8,9 +8,9 @@
 from torchml.svm import LinearSVC
 
 n_samples = 5000
-n_features = 10
+n_features = 5
 n_classes = 5
-n_informative = 7
+n_informative = 5
 
 
 class TestLinearSVC(unittest.TestCase):
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index 79f583d..f05ac25 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -2,6 +2,7 @@
 
 import torchml as ml
 import cvxpy as cp
+from cvxpylayers.torch import CvxpyLayer
 
 
 class LinearSVC(ml.Model):
@@ -94,21 +95,19 @@ def _fit_with_one_class(
         if self.fit_intercept:
             b = cp.Variable()
         X_param = cp.Parameter((m, n))
-        y_param = cp.Parameter((m, 1))
-        C_param = cp.Parameter(nonneg=True)
         ones = torch.ones((m, 1))
 
         loss = cp.multiply((1 / 2.0), cp.norm(w, 2))
 
         if self.fit_intercept:
-            hinge = cp.pos(ones - cp.multiply(y_param, X_param @ w + b))
+            hinge = cp.pos(ones - cp.multiply(y, X_param @ w + b))
         else:
-            hinge = cp.pos(ones - cp.multiply(y_param, X_param @ w))
+            hinge = cp.pos(ones - cp.multiply(y, X_param @ w))
 
         if self.loss == "squared_hinge":
-            loss += C_param * cp.sum(cp.square(hinge))
+            loss += cp.multiply(self.C,  cp.sum(cp.square(hinge)))
         elif self.loss == "hinge":
-            loss += C_param * cp.sum(hinge)
+            loss += cp.multiply(self.C, cp.sum(hinge))
 
         objective = loss
 
@@ -116,14 +115,25 @@ def _fit_with_one_class(
         constraints = []
 
         prob = cp.Problem(cp.Minimize(objective), constraints)
-        X_param.value = X.numpy()
-        y_param.value = y.numpy()
-        C_param.value = self.C
-        prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
+        assert prob.is_dpp()
+
+        # convert into pytorch layer
+        if self.fit_intercept:
+            fit_lr = CvxpyLayer(prob, [X_param], [w, b])
+        else:
+            fit_lr = CvxpyLayer(prob, [X_param], [w])
+
+        # prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
+        if self.fit_intercept:
+            weight, intercept = fit_lr(X, solver_args={"solve_method": "ECOS", "abstol": self.tol, "max_iters": self.max_iter})
+        else:
+            weight = fit_lr(X, solver_args={"solve_method": "ECOS", "abstol": self.tol, "max_iters": self.max_iter})
+
+        self.coef_ = torch.cat((self.coef_, torch.t(weight)))
+
 
-        self.coef_ = torch.cat((self.coef_, torch.t(torch.from_numpy(w.value))))
         if self.fit_intercept:
             self.intercept_ = torch.cat(
-                (self.intercept_, torch.unsqueeze(torch.from_numpy(b.value), 0))
+                (self.intercept_, torch.unsqueeze(intercept, 0))
             )
         return self
diff --git a/torchml/svm/linear_svr.py b/torchml/svm/linear_svr.py
index 301a236..b24194e 100644
--- a/torchml/svm/linear_svr.py
+++ b/torchml/svm/linear_svr.py
@@ -109,7 +109,8 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         """
 
         if self.C < 0:
-            raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
+            raise ValueError(
+                "Penalty term must be positive; got (C=%r)" % self.C)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
         m, n = X.shape
         m, n = X.shape

From 22afc05ddaedf2f0c74b46e444bf9eb29e977b6b Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Fri, 4 Nov 2022 21:22:52 -0700
Subject: [PATCH 19/31] add gradient support

---
 tests/unit/svm/linear_svc_test.py |  2 +-
 torchml/svm/linear_svc.py         | 41 ++++++++++++++++---------------
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 9172cc4..35c4169 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -7,7 +7,7 @@
 
 from torchml.svm import LinearSVC
 
-n_samples = 5000
+n_samples = 1000
 n_features = 5
 n_classes = 5
 n_informative = 5
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index f05ac25..18fa800 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -7,20 +7,20 @@
 
 class LinearSVC(ml.Model):
     def __init__(
-        self,
-        penalty="l2",
-        loss="squared_hinge",
-        *,
-        dual=True,
-        tol=1e-4,
-        C=1.0,
-        multi_class="ovr",
-        fit_intercept=True,
-        intercept_scaling=1,
-        class_weight=None,
-        verbose=0,
-        random_state=None,
-        max_iter=1000,
+            self,
+            penalty="l2",
+            loss="squared_hinge",
+            *,
+            dual=True,
+            tol=1e-4,
+            C=1.0,
+            multi_class="ovr",
+            fit_intercept=True,
+            intercept_scaling=1,
+            class_weight=None,
+            verbose=0,
+            random_state=None,
+            max_iter=1000,
     ):
         super(LinearSVC, self).__init__()
         self.coef_ = None
@@ -56,7 +56,8 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
                 self._fit_with_one_class(X, y, x, sample_weight=sample_weight)
 
     def decision_function(self, X: torch.Tensor) -> torch.Tensor:
-        return X @ self.coef_.T + self.intercept_
+        scores = X @ self.coef_.T + self.intercept_
+        return scores.ravel() if scores.shape[1] == 1 else scores
 
     def predict(self, X: torch.Tensor) -> torch.Tensor:
         """
@@ -74,13 +75,13 @@ def predict(self, X: torch.Tensor) -> torch.Tensor:
         """
         scores = self.decision_function(X)
         if len(scores.shape) == 1:
-            indices = (scores > 0).int()
+            indices = (scores > 0).long()
         else:
             indices = scores.argmax(dim=1)
         return self.classes_[indices]
 
     def _fit_with_one_class(
-        self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None
+            self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None
     ):
 
         m, n = X.shape
@@ -105,7 +106,7 @@ def _fit_with_one_class(
             hinge = cp.pos(ones - cp.multiply(y, X_param @ w))
 
         if self.loss == "squared_hinge":
-            loss += cp.multiply(self.C,  cp.sum(cp.square(hinge)))
+            loss += cp.multiply(self.C, cp.sum(cp.square(hinge)))
         elif self.loss == "hinge":
             loss += cp.multiply(self.C, cp.sum(hinge))
 
@@ -125,13 +126,13 @@ def _fit_with_one_class(
 
         # prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
         if self.fit_intercept:
-            weight, intercept = fit_lr(X, solver_args={"solve_method": "ECOS", "abstol": self.tol, "max_iters": self.max_iter})
+            weight, intercept = fit_lr(X, solver_args={"solve_method": "ECOS", "abstol": self.tol,
+                                                       "max_iters": self.max_iter})
         else:
             weight = fit_lr(X, solver_args={"solve_method": "ECOS", "abstol": self.tol, "max_iters": self.max_iter})
 
         self.coef_ = torch.cat((self.coef_, torch.t(weight)))
 
-
         if self.fit_intercept:
             self.intercept_ = torch.cat(
                 (self.intercept_, torch.unsqueeze(intercept, 0))

From 9364f721167696afb09b3c75de1d9c2862b5e95d Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Fri, 4 Nov 2022 21:25:05 -0700
Subject: [PATCH 20/31] make format

---
 torchml/svm/linear_svc.py | 24 ++++++++++++++++++------
 torchml/svm/linear_svr.py |  3 +--
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index c35e37d..ff9614e 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -147,7 +147,7 @@ def decision_function(self, X: torch.Tensor) -> torch.Tensor:
 
         """
         scores = X @ self.coef_.T + self.intercept_
-        return scores.ravel() if scores.shape[1] == 1 else scores 
+        return scores.ravel() if scores.shape[1] == 1 else scores
 
     def predict(self, X: torch.Tensor) -> torch.Tensor:
         """
@@ -167,9 +167,8 @@ def predict(self, X: torch.Tensor) -> torch.Tensor:
         return self.classes_[indices]
 
     def _fit_with_one_class(
-            self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None
+        self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None
     ):
-
         m, n = X.shape
 
         y = torch.unsqueeze(y, 1)
@@ -212,10 +211,23 @@ def _fit_with_one_class(
 
         # prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
         if self.fit_intercept:
-            weight, intercept = fit_lr(X, solver_args={"solve_method": "ECOS", "abstol": self.tol,
-                                                       "max_iters": self.max_iter})
+            weight, intercept = fit_lr(
+                X,
+                solver_args={
+                    "solve_method": "ECOS",
+                    "abstol": self.tol,
+                    "max_iters": self.max_iter,
+                },
+            )
         else:
-            weight = fit_lr(X, solver_args={"solve_method": "ECOS", "abstol": self.tol, "max_iters": self.max_iter})
+            weight = fit_lr(
+                X,
+                solver_args={
+                    "solve_method": "ECOS",
+                    "abstol": self.tol,
+                    "max_iters": self.max_iter,
+                },
+            )
 
         self.coef_ = torch.cat((self.coef_, torch.t(weight)))
 
diff --git a/torchml/svm/linear_svr.py b/torchml/svm/linear_svr.py
index b24194e..301a236 100644
--- a/torchml/svm/linear_svr.py
+++ b/torchml/svm/linear_svr.py
@@ -109,8 +109,7 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         """
 
         if self.C < 0:
-            raise ValueError(
-                "Penalty term must be positive; got (C=%r)" % self.C)
+            raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
         m, n = X.shape
         m, n = X.shape

From 3c0d3ce1a4c6a91ac5ea006c7413f6644bf2e46c Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Sat, 5 Nov 2022 16:55:01 -0700
Subject: [PATCH 21/31] fix tests

---
 tests/unit/svm/linear_svc_test.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 35c4169..82fd726 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -7,7 +7,7 @@
 
 from torchml.svm import LinearSVC
 
-n_samples = 1000
+n_samples = 4000
 n_features = 5
 n_classes = 5
 n_informative = 5
@@ -33,7 +33,8 @@ def test_LinearSVC(self):
 
         end = time.time()
         # print(end - start)
-        self.assertTrue(np.allclose(lsvc.coef_.numpy(), reflsvc.coef_, atol=1e-2))
+        self.assertTrue(np.allclose(
+            lsvc.coef_.numpy(), reflsvc.coef_, atol=1e-2))
         self.assertTrue(
             np.allclose(lsvc.intercept_.numpy(), reflsvc.intercept_, atol=1e-2)
         )

From 60f670c3ddcaf0a1520b4a7ae8a627a682b238a8 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Thu, 10 Nov 2022 18:12:51 -0800
Subject: [PATCH 22/31] add pylayers to linear svr

---
 .../neighbors/k_neighbors_classifier_test.py  |  2 +-
 .../unit/neighbors/nearest_neighbors_test.py  |  2 +-
 tests/unit/svm/linear_svc_test.py             |  2 +-
 tests/unit/svm/linear_svr_test.py             |  4 +-
 torchml/svm/linear_svr.py                     | 52 +++++++++++++------
 5 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/tests/unit/neighbors/k_neighbors_classifier_test.py b/tests/unit/neighbors/k_neighbors_classifier_test.py
index 1db5479..14fc05a 100644
--- a/tests/unit/neighbors/k_neighbors_classifier_test.py
+++ b/tests/unit/neighbors/k_neighbors_classifier_test.py
@@ -10,7 +10,7 @@
 
 class TestkneighborsClassifier(unittest.TestCase):
     def test_knn_classifier(self):
-        for i in range(1, 20, 1):
+        for i in range(1, 5, 1):
             X = np.random.randn(BSZ, DIM)
             y = np.random.randint(low=-100, high=100, size=BSZ)
             p = np.random.randn(5, DIM)
diff --git a/tests/unit/neighbors/nearest_neighbors_test.py b/tests/unit/neighbors/nearest_neighbors_test.py
index 28ceb54..c74053a 100644
--- a/tests/unit/neighbors/nearest_neighbors_test.py
+++ b/tests/unit/neighbors/nearest_neighbors_test.py
@@ -10,7 +10,7 @@
 
 class Testkneighbors(unittest.TestCase):
     def test_kneighbors(self):
-        for i in range(1, 200, 1):
+        for i in range(1, 5, 1):
             X = np.random.randn(BSZ, DIM)
             y = np.random.randn(5, DIM)
             ref = neighbors.NearestNeighbors(p=i)
diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 82fd726..32e8390 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -7,7 +7,7 @@
 
 from torchml.svm import LinearSVC
 
-n_samples = 4000
+n_samples = 5000
 n_features = 5
 n_classes = 5
 n_informative = 5
diff --git a/tests/unit/svm/linear_svr_test.py b/tests/unit/svm/linear_svr_test.py
index c5c7a69..6a0fa33 100644
--- a/tests/unit/svm/linear_svr_test.py
+++ b/tests/unit/svm/linear_svr_test.py
@@ -23,13 +23,13 @@ def test_LinearSVR(self):
         start = time.time()
         lsvr.fit(torch.from_numpy(x), torch.from_numpy(y))
         end = time.time()
-        print(end - start)
+        # print(end - start)
         start = time.time()
         reflsvr = svm.LinearSVR(max_iter=100000)
         reflsvr.fit(x, y)
 
         end = time.time()
-        print(end - start)
+        # print(end - start)
         self.assertTrue(np.allclose(lsvr.coef_.numpy(), reflsvr.coef_, atol=1e-2))
         self.assertTrue(
             np.allclose(lsvr.intercept_.numpy(), reflsvr.intercept_, atol=1e-2)
diff --git a/torchml/svm/linear_svr.py b/torchml/svm/linear_svr.py
index 301a236..2805ec7 100644
--- a/torchml/svm/linear_svr.py
+++ b/torchml/svm/linear_svr.py
@@ -2,6 +2,7 @@
 
 import torchml as ml
 import cvxpy as cp
+from cvxpylayers.torch import CvxpyLayer
 
 
 class LinearSVR(ml.Model):
@@ -109,7 +110,8 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         """
 
         if self.C < 0:
-            raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
+            raise ValueError(
+                "Penalty term must be positive; got (C=%r)" % self.C)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
         m, n = X.shape
         m, n = X.shape
@@ -120,21 +122,18 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         if self.fit_intercept:
             b = cp.Variable()
         X_param = cp.Parameter((m, n))
-        y_param = cp.Parameter((m, 1))
-        C_param = cp.Parameter(nonneg=True)
-        epi_param = cp.Parameter()
 
         loss = cp.multiply((1 / 2.0), cp.norm(w, 2))
 
         if self.fit_intercept:
-            hinge = cp.pos(cp.abs(y_param - (X_param @ w + b)) - epi_param)
+            hinge = cp.pos(cp.abs(y - (X_param @ w + b)) - self.epsilon)
         else:
-            hinge = cp.pos(cp.abs(y_param - (X_param @ w + b)) - epi_param)
+            hinge = cp.pos(cp.abs(y - (X_param @ w + b)) - self.epsilon)
 
         if self.loss == "epsilon_insensitive":
-            loss += C_param * cp.sum(cp.square(hinge))
+            loss += self.C * cp.sum(cp.square(hinge))
         elif self.loss == "squared_epsilon_insensitive":
-            loss += C_param * cp.sum(hinge)
+            loss += self.C * cp.sum(hinge)
 
         objective = loss
 
@@ -142,15 +141,36 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         constraints = []
 
         prob = cp.Problem(cp.Minimize(objective), constraints)
+        assert prob.is_dpp()
         X_param.value = X.numpy()
-        y_param.value = y.numpy()
-        C_param.value = self.C
-        epi_param.value = self.epsilon
-        prob.solve(solver="ECOS", abstol=self.tol, max_iters=self.max_iter)
-
-        self.coef_, self.intercept_ = torch.flatten(
-            torch.from_numpy(w.value)
-        ), torch.flatten(torch.from_numpy(b.value))
+        if self.fit_intercept:
+            fit_lr = CvxpyLayer(prob, [X_param], [w, b])
+        else:
+            fit_lr = CvxpyLayer(prob, [X_param], [w])
+
+        if self.fit_intercept:
+            self.coef_, self.intercept_ = fit_lr(
+                X,
+                solver_args={
+                    "solve_method": "ECOS",
+                    "abstol": self.tol,
+                    "max_iters": self.max_iter,
+                },
+            )
+        else:
+            self.coef_, = fit_lr(
+                X,
+                solver_args={
+                    "solve_method": "ECOS",
+                    "abstol": self.tol,
+                    "max_iters": self.max_iter,
+                },
+            )
+
+        self.coef_ = torch.flatten(self.coef_)
+        if self.fit_intercept:
+            self.intercept_ = torch.flatten(self.intercept_)
+
         return self
 
     def predict(self, X: torch.Tensor) -> torch.Tensor:

From 902e13f2097abe6a0e92fab8022172fcb5e93109 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Thu, 10 Nov 2022 18:15:04 -0800
Subject: [PATCH 23/31] make format

---
 tests/unit/svm/linear_svc_test.py | 3 +--
 torchml/svm/linear_svr.py         | 5 ++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 32e8390..9172cc4 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -33,8 +33,7 @@ def test_LinearSVC(self):
 
         end = time.time()
         # print(end - start)
-        self.assertTrue(np.allclose(
-            lsvc.coef_.numpy(), reflsvc.coef_, atol=1e-2))
+        self.assertTrue(np.allclose(lsvc.coef_.numpy(), reflsvc.coef_, atol=1e-2))
         self.assertTrue(
             np.allclose(lsvc.intercept_.numpy(), reflsvc.intercept_, atol=1e-2)
         )
diff --git a/torchml/svm/linear_svr.py b/torchml/svm/linear_svr.py
index 2805ec7..14047e6 100644
--- a/torchml/svm/linear_svr.py
+++ b/torchml/svm/linear_svr.py
@@ -110,8 +110,7 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         """
 
         if self.C < 0:
-            raise ValueError(
-                "Penalty term must be positive; got (C=%r)" % self.C)
+            raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
         m, n = X.shape
         m, n = X.shape
@@ -158,7 +157,7 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
                 },
             )
         else:
-            self.coef_, = fit_lr(
+            (self.coef_,) = fit_lr(
                 X,
                 solver_args={
                     "solve_method": "ECOS",

From 28840a409e3a160bc9d22dba8d7ee97ec31fa0c6 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Thu, 10 Nov 2022 18:39:53 -0800
Subject: [PATCH 24/31] add knnclassifier gradcheck

---
 tests/unit/neighbors/k_neighbors_classifier_test.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/unit/neighbors/k_neighbors_classifier_test.py b/tests/unit/neighbors/k_neighbors_classifier_test.py
index 14fc05a..e0075f2 100644
--- a/tests/unit/neighbors/k_neighbors_classifier_test.py
+++ b/tests/unit/neighbors/k_neighbors_classifier_test.py
@@ -3,6 +3,8 @@
 import torch
 import torchml as ml
 import sklearn.neighbors as neighbors
+from torch.autograd import gradcheck
+
 
 BSZ = 1000
 DIM = 50
@@ -13,7 +15,7 @@ def test_knn_classifier(self):
         for i in range(1, 5, 1):
             X = np.random.randn(BSZ, DIM)
             y = np.random.randint(low=-100, high=100, size=BSZ)
-            p = np.random.randn(5, DIM)
+            p = np.random.randn(1, DIM)
 
             ref = neighbors.KNeighborsClassifier(
                 weights="distance" if i % 2 else "uniform", p=i
@@ -26,13 +28,19 @@ def test_knn_classifier(self):
                 weights="distance" if i % 2 else "uniform", p=i
             )
             test.fit(torch.from_numpy(X), torch.from_numpy(y))
+            inputP = torch.from_numpy(p)
+            inputP.requires_grad = True
+            
             testr = test.predict(torch.from_numpy(p))
             testp = test.predict_proba(torch.from_numpy(p))
+            self.assertTrue(gradcheck(test.predict, inputP, eps=1e-6, atol=1e-3))
+            # self.assertTrue(gradcheck(test.predict_proba, inputP, eps=1e-20, atol=1e-3))
             self.assertTrue(np.allclose(refr, testr.numpy()))
             self.assertTrue(np.allclose(refp, testp.numpy()))
 
             refr2 = ref.kneighbors(p)
             testr2 = test.kneighbors(torch.from_numpy(p))
+            self.assertTrue(gradcheck(test.kneighbors, inputP, eps=1e-6, atol=1e-3))
             self.assertTrue(np.allclose(refr2[0], testr2[0].numpy()))
             self.assertTrue(np.allclose(refr2[1], testr2[1].numpy()))
 

From 899e6f16eabd95321e9789403e89c95c5981b988 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Thu, 10 Nov 2022 18:45:58 -0800
Subject: [PATCH 25/31] add gradcheck to neighbors

---
 tests/unit/neighbors/k_neighbors_classifier_test.py | 2 +-
 tests/unit/neighbors/nearest_centroids_test.py      | 4 ++++
 tests/unit/neighbors/nearest_neighbors_test.py      | 5 +++++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/unit/neighbors/k_neighbors_classifier_test.py b/tests/unit/neighbors/k_neighbors_classifier_test.py
index e0075f2..7333fbc 100644
--- a/tests/unit/neighbors/k_neighbors_classifier_test.py
+++ b/tests/unit/neighbors/k_neighbors_classifier_test.py
@@ -15,7 +15,7 @@ def test_knn_classifier(self):
         for i in range(1, 5, 1):
             X = np.random.randn(BSZ, DIM)
             y = np.random.randint(low=-100, high=100, size=BSZ)
-            p = np.random.randn(1, DIM)
+            p = np.random.randn(5, DIM)
 
             ref = neighbors.KNeighborsClassifier(
                 weights="distance" if i % 2 else "uniform", p=i
diff --git a/tests/unit/neighbors/nearest_centroids_test.py b/tests/unit/neighbors/nearest_centroids_test.py
index 4a6dc5e..5f2b35b 100644
--- a/tests/unit/neighbors/nearest_centroids_test.py
+++ b/tests/unit/neighbors/nearest_centroids_test.py
@@ -3,6 +3,7 @@
 import torch
 import torchml as ml
 import sklearn.neighbors as neighbors
+from torch.autograd import gradcheck
 
 # define numbers of classes & features
 SAMPLES = 10
@@ -26,6 +27,9 @@ def test_kneighbors(self):
             refres = ref.predict(samp)
             centres = cent.predict(torch.from_numpy(samp)).numpy()
             self.assertTrue(np.array_equal(refres, centres))
+            inputSamp = torch.from_numpy(samp)
+            inputSamp.requires_grad = True
+            self.assertTrue(gradcheck(cent.predict, inputSamp, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/neighbors/nearest_neighbors_test.py b/tests/unit/neighbors/nearest_neighbors_test.py
index c74053a..ea274fe 100644
--- a/tests/unit/neighbors/nearest_neighbors_test.py
+++ b/tests/unit/neighbors/nearest_neighbors_test.py
@@ -3,6 +3,7 @@
 import torch
 import torchml as ml
 import sklearn.neighbors as neighbors
+from torch.autograd import gradcheck
 
 BSZ = 128
 DIM = 5
@@ -24,6 +25,9 @@ def test_kneighbors(self):
             # return distance is true
             self.assertTrue(np.allclose(test[0], res[0].numpy()))
             self.assertTrue(np.allclose(test[1], res[1].numpy()))
+            inputY = torch.from_numpy(y)
+            inputY.requires_grad = True
+            self.assertTrue(gradcheck(model.kneighbors, inputY, eps=1e-6, atol=1e-3))
 
             ref = neighbors.NearestNeighbors(p=i)
             ref.fit(X)
@@ -35,6 +39,7 @@ def test_kneighbors(self):
 
             # return distance is false
             self.assertTrue(np.allclose(test, res.numpy()))
+            self.assertTrue(gradcheck(model.kneighbors, inputY, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":

From 688c4395541a8b88dcc974a887802a3b23a9099d Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Thu, 10 Nov 2022 18:58:14 -0800
Subject: [PATCH 26/31] add gradcheck to naive bayes, linear model and svm tests

---
 .../gaussian_naive_bayes/gaussian_nb_test.py     |  5 +++++
 tests/unit/linear_model/lasso_test.py            | 16 ++++++++++++++++
 .../unit/linear_model/linear_regression_test.py  |  6 ++++++
 tests/unit/linear_model/ridge_test.py            | 11 +++++++++++
 tests/unit/svm/linear_svc_test.py                |  7 +++++++
 tests/unit/svm/linear_svr_test.py                |  5 +++++
 torchml/svm/linear_svr.py                        |  2 +-
 7 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/tests/unit/gaussian_naive_bayes/gaussian_nb_test.py b/tests/unit/gaussian_naive_bayes/gaussian_nb_test.py
index 3c66c09..73b994b 100644
--- a/tests/unit/gaussian_naive_bayes/gaussian_nb_test.py
+++ b/tests/unit/gaussian_naive_bayes/gaussian_nb_test.py
@@ -3,6 +3,7 @@
 import torch
 import torchml as ml
 from sklearn.naive_bayes import GaussianNB
+from torch.autograd import gradcheck
 
 
 BSZ = 128
@@ -25,6 +26,10 @@ def test_fit(self):
 
         self.assertTrue(np.allclose(ref_preds, model_preds.numpy()))
         self.assertTrue(np.allclose(ref_preds, model_forward.numpy()))
+        inputX = torch.from_numpy(X)
+        inputX.requires_grad = True
+        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/linear_model/lasso_test.py b/tests/unit/linear_model/lasso_test.py
index b02a15e..72b525d 100644
--- a/tests/unit/linear_model/lasso_test.py
+++ b/tests/unit/linear_model/lasso_test.py
@@ -3,6 +3,7 @@
 import torch
 import torchml as ml
 import sklearn.linear_model as linear_model
+from torch.autograd import gradcheck
 
 
 BSZ = 128
@@ -32,6 +33,11 @@ def test_fit(self):
             )
         )
 
+        inputX = torch.from_numpy(X)
+        inputX.requires_grad = True
+        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+
     def test_fit_intercept(self):
         X = np.random.randn(BSZ, DIM)
         y = np.random.randn(BSZ, 1)
@@ -54,6 +60,11 @@ def test_fit_intercept(self):
             )
         )
 
+        inputX = torch.from_numpy(X)
+        inputX.requires_grad = True
+        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+
     def test_fit_positive(self):
         X = np.random.randn(BSZ, DIM)
         y = np.random.randn(BSZ, 1)
@@ -76,6 +87,11 @@ def test_fit_positive(self):
             )
         )
 
+        inputX = torch.from_numpy(X)
+        inputX.requires_grad = True
+        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unit/linear_model/linear_regression_test.py b/tests/unit/linear_model/linear_regression_test.py
index 0d36710..7eab44e 100644
--- a/tests/unit/linear_model/linear_regression_test.py
+++ b/tests/unit/linear_model/linear_regression_test.py
@@ -3,6 +3,7 @@
 import torch
 import torchml as ml
 import sklearn.linear_model as linear_model
+from torch.autograd import gradcheck
 
 
 BSZ = 128
@@ -26,6 +27,11 @@ def test_fit(self):
         self.assertTrue(np.allclose(ref_preds, model_preds.numpy()))
         self.assertTrue(np.allclose(ref_preds, model_forward.numpy()))
 
+        inputX = torch.from_numpy(X)
+        inputX.requires_grad = True
+        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unit/linear_model/ridge_test.py b/tests/unit/linear_model/ridge_test.py
index 315275e..30c257e 100644
--- a/tests/unit/linear_model/ridge_test.py
+++ b/tests/unit/linear_model/ridge_test.py
@@ -3,6 +3,7 @@
 import torch
 import torchml as ml
 import sklearn.linear_model as linear_model
+from torch.autograd import gradcheck
 
 
 BSZ = 128
@@ -26,6 +27,11 @@ def test_fit(self):
         self.assertTrue(np.allclose(ref_preds, model_preds.numpy()))
         self.assertTrue(np.allclose(ref_preds, model_forward.numpy()))
 
+        inputX = torch.from_numpy(X)
+        inputX.requires_grad = True
+        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+
     def test_fit_intercept(self):
         X = np.random.randn(BSZ, DIM)
         y = np.random.randn(BSZ, 1)
@@ -42,6 +48,11 @@ def test_fit_intercept(self):
         self.assertTrue(np.allclose(ref_preds, model_preds.numpy()))
         self.assertTrue(np.allclose(ref_preds, model_forward.numpy()))
 
+        inputX = torch.from_numpy(X)
+        inputX.requires_grad = True
+        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 9172cc4..3561794 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -4,6 +4,7 @@
 from sklearn.datasets import make_classification
 import sklearn.svm as svm
 import time
+from torch.autograd import gradcheck
 
 from torchml.svm import LinearSVC
 
@@ -44,11 +45,17 @@ def test_LinearSVC(self):
                 atol=1e-2,
             )
         )
+
+        inputX = torch.from_numpy(x)
+        inputX.requires_grad = True
+        self.assertTrue(gradcheck(lsvc.decision_function, inputX, eps=1e-6, atol=1e-3))
+
         self.assertTrue(
             np.allclose(
                 lsvc.predict(torch.from_numpy(x)), reflsvc.predict(x), atol=1e-2
             )
         )
+        self.assertTrue(gradcheck(lsvc.predict, inputX, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/svm/linear_svr_test.py b/tests/unit/svm/linear_svr_test.py
index 6a0fa33..e6b0857 100644
--- a/tests/unit/svm/linear_svr_test.py
+++ b/tests/unit/svm/linear_svr_test.py
@@ -4,6 +4,7 @@
 from sklearn.datasets import make_regression
 import sklearn.svm as svm
 import time
+from torch.autograd import gradcheck
 
 from torchml.svm import LinearSVR
 
@@ -40,6 +41,10 @@ def test_LinearSVR(self):
             )
         )
 
+        inputX = torch.from_numpy(x)
+        inputX.requires_grad = True
+        self.assertTrue(gradcheck(lsvr.predict, inputX, eps=1e-6, atol=1e-3))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torchml/svm/linear_svr.py b/torchml/svm/linear_svr.py
index 14047e6..913390e 100644
--- a/torchml/svm/linear_svr.py
+++ b/torchml/svm/linear_svr.py
@@ -141,7 +141,7 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
 
         prob = cp.Problem(cp.Minimize(objective), constraints)
         assert prob.is_dpp()
-        X_param.value = X.numpy()
+
         if self.fit_intercept:
             fit_lr = CvxpyLayer(prob, [X_param], [w, b])
         else:

From 2a0f810022f94d21d2b8a01dc7f55f15ef5284d7 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Sun, 13 Nov 2022 20:37:02 -0800
Subject: [PATCH 27/31] add lasso gpu support

---
 tests/unit/linear_model/lasso_test.py | 145 ++++++++++++++------------
 torchml/linear_model/lasso.py         |   6 +-
 2 files changed, 80 insertions(+), 71 deletions(-)

diff --git a/tests/unit/linear_model/lasso_test.py b/tests/unit/linear_model/lasso_test.py
index 72b525d..f88ed25 100644
--- a/tests/unit/linear_model/lasso_test.py
+++ b/tests/unit/linear_model/lasso_test.py
@@ -12,85 +12,92 @@
 
 class TestLasso(unittest.TestCase):
     def test_fit(self):
-        X = np.random.randn(BSZ, DIM)
-        y = np.random.randn(BSZ, 1)
-
-        ref = linear_model.Lasso(fit_intercept=False)
-        ref.fit(X, y)
-        ref_preds = ref.predict(X)
-
-        model = ml.linear_model.Lasso()
-        model.fit(torch.from_numpy(X), torch.from_numpy(y))
-        model_preds = model.predict(torch.from_numpy(X))
-        model_forward = model(torch.from_numpy(X))
-
-        self.assertTrue(
-            np.allclose(ref_preds, model_preds[0].detach().numpy().flatten(), atol=1e-3)
-        )
-        self.assertTrue(
-            np.allclose(
-                ref_preds, model_forward[0].detach().numpy().flatten(), atol=1e-3
+        for i in range(2):
+            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+
+            X = np.random.randn(BSZ, DIM)
+            y = np.random.randn(BSZ, 1)
+
+            ref = linear_model.Lasso(fit_intercept=False)
+            ref.fit(X, y)
+            ref_preds = ref.predict(X)
+
+            model = ml.linear_model.Lasso()
+            model.fit(torch.from_numpy(X).to(device), torch.from_numpy(y).to(device))
+            model_preds = model.predict(torch.from_numpy(X).to(device))
+            model_forward = model(torch.from_numpy(X).to(device))
+
+            self.assertTrue(
+                np.allclose(ref_preds, model_preds[0].detach().cpu().numpy().flatten(), atol=1e-3)
+            )
+            self.assertTrue(
+                np.allclose(
+                    ref_preds, model_forward[0].detach().cpu().numpy().flatten(), atol=1e-3
+                )
             )
-        )
 
-        inputX = torch.from_numpy(X)
-        inputX.requires_grad = True
-        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
-        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+            inputX = torch.from_numpy(X).to(device)
+            inputX.requires_grad = True
+            self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+            self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
 
     def test_fit_intercept(self):
-        X = np.random.randn(BSZ, DIM)
-        y = np.random.randn(BSZ, 1)
-
-        ref = linear_model.Lasso(fit_intercept=True)
-        ref.fit(X, y)
-        ref_preds = ref.predict(X)
-
-        model = ml.linear_model.Lasso(fit_intercept=True)
-        model.fit(torch.from_numpy(X), torch.from_numpy(y))
-        model_preds = model.predict(torch.from_numpy(X))
-        model_forward = model(torch.from_numpy(X))
-
-        self.assertTrue(
-            np.allclose(ref_preds, model_preds[0].detach().numpy().flatten(), atol=1e-3)
-        )
-        self.assertTrue(
-            np.allclose(
-                ref_preds, model_forward[0].detach().numpy().flatten(), atol=1e-3
+        for i in range(2):
+            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            X = np.random.randn(BSZ, DIM)
+            y = np.random.randn(BSZ, 1)
+
+            ref = linear_model.Lasso(fit_intercept=True)
+            ref.fit(X, y)
+            ref_preds = ref.predict(X)
+
+            model = ml.linear_model.Lasso(fit_intercept=True)
+            model.fit(torch.from_numpy(X).to(device), torch.from_numpy(y).to(device))
+            model_preds = model.predict(torch.from_numpy(X).to(device))
+            model_forward = model(torch.from_numpy(X).to(device))
+
+            self.assertTrue(
+                np.allclose(ref_preds, model_preds[0].detach().cpu().numpy().flatten(), atol=1e-3)
+            )
+            self.assertTrue(
+                np.allclose(
+                    ref_preds, model_forward[0].detach().cpu().numpy().flatten(), atol=1e-3
+                )
             )
-        )
 
-        inputX = torch.from_numpy(X)
-        inputX.requires_grad = True
-        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
-        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+            inputX = torch.from_numpy(X).to(device)
+            inputX.requires_grad = True
+            self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+            self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
 
     def test_fit_positive(self):
-        X = np.random.randn(BSZ, DIM)
-        y = np.random.randn(BSZ, 1)
-
-        ref = linear_model.Lasso(fit_intercept=False, positive=True)
-        ref.fit(X, y)
-        ref_preds = ref.predict(X)
-
-        model = ml.linear_model.Lasso(fit_intercept=False, positive=True)
-        model.fit(torch.from_numpy(X), torch.from_numpy(y))
-        model_preds = model.predict(torch.from_numpy(X))
-        model_forward = model(torch.from_numpy(X))
-
-        self.assertTrue(
-            np.allclose(ref_preds, model_preds[0].detach().numpy().flatten(), atol=1e-3)
-        )
-        self.assertTrue(
-            np.allclose(
-                ref_preds, model_forward[0].detach().numpy().flatten(), atol=1e-3
+        for i in range(2):
+            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            X = np.random.randn(BSZ, DIM)
+            y = np.random.randn(BSZ, 1)
+
+            ref = linear_model.Lasso(fit_intercept=False, positive=True)
+            ref.fit(X, y)
+            ref_preds = ref.predict(X)
+
+            model = ml.linear_model.Lasso(fit_intercept=False, positive=True)
+            model.fit(torch.from_numpy(X).to(device), torch.from_numpy(y).to(device))
+            model_preds = model.predict(torch.from_numpy(X).to(device))
+            model_forward = model(torch.from_numpy(X).to(device))
+
+            self.assertTrue(
+                np.allclose(ref_preds, model_preds[0].detach().cpu().numpy().flatten(), atol=1e-3)
+            )
+            self.assertTrue(
+                np.allclose(
+                    ref_preds, model_forward[0].detach().cpu().numpy().flatten(), atol=1e-3
+                )
             )
-        )
 
-        inputX = torch.from_numpy(X)
-        inputX.requires_grad = True
-        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
-        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+            inputX = torch.from_numpy(X).to(device)
+            inputX.requires_grad = True
+            self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+            self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/torchml/linear_model/lasso.py b/torchml/linear_model/lasso.py
index a6ab155..557642b 100644
--- a/torchml/linear_model/lasso.py
+++ b/torchml/linear_model/lasso.py
@@ -81,6 +81,8 @@ def fit(self, X: torch.Tensor, y: torch.Tensor):
 
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
 
+        device = X.device
+
         m, n = X.shape
 
         w = cp.Variable((n, 1))
@@ -120,10 +122,10 @@ def fit(self, X: torch.Tensor, y: torch.Tensor):
         # this object is now callable with pytorch tensors
         if self.fit_intercept:
             self.weight, self.intercept = fit_lr(
-                X, y, torch.tensor(self.alpha, dtype=torch.float64)
+                X, y, torch.tensor(self.alpha, dtype=torch.float64, device=device)
             )
         else:
-            self.weight = fit_lr(X, y, torch.tensor(self.alpha, dtype=torch.float64))
+            self.weight = fit_lr(X, y, torch.tensor(self.alpha, dtype=torch.float64, device=device))
         self.weight = torch.stack(list(self.weight), dim=0)
 
     def predict(self, X: torch.Tensor):

From a5d433623f0205bdf384130754ade025e0ea413b Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Sun, 13 Nov 2022 20:43:07 -0800
Subject: [PATCH 28/31] add gpu support for ridge and linear regression

---
 .../linear_model/linear_regression_test.py    | 40 ++++++-----
 tests/unit/linear_model/ridge_test.py         | 72 ++++++++++---------
 torchml/linear_model/ridge.py                 |  8 ++-
 3 files changed, 64 insertions(+), 56 deletions(-)

diff --git a/tests/unit/linear_model/linear_regression_test.py b/tests/unit/linear_model/linear_regression_test.py
index 7eab44e..c056daa 100644
--- a/tests/unit/linear_model/linear_regression_test.py
+++ b/tests/unit/linear_model/linear_regression_test.py
@@ -12,25 +12,27 @@
 
 class TestLinearRegression(unittest.TestCase):
     def test_fit(self):
-        X = np.random.randn(BSZ, DIM)
-        y = np.random.randn(BSZ, 1)
-
-        ref = linear_model.LinearRegression(fit_intercept=False)
-        ref.fit(X, y)
-        ref_preds = ref.predict(X)
-
-        model = ml.linear_model.LinearRegression(fit_intercept=False)
-        model.fit(torch.from_numpy(X), torch.from_numpy(y))
-        model_preds = model.predict(torch.from_numpy(X))
-        model_forward = model(torch.from_numpy(X))
-
-        self.assertTrue(np.allclose(ref_preds, model_preds.numpy()))
-        self.assertTrue(np.allclose(ref_preds, model_forward.numpy()))
-
-        inputX = torch.from_numpy(X)
-        inputX.requires_grad = True
-        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
-        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+        for i in range(2):
+            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            X = np.random.randn(BSZ, DIM)
+            y = np.random.randn(BSZ, 1)
+
+            ref = linear_model.LinearRegression(fit_intercept=False)
+            ref.fit(X, y)
+            ref_preds = ref.predict(X)
+
+            model = ml.linear_model.LinearRegression(fit_intercept=False)
+            model.fit(torch.from_numpy(X).to(device), torch.from_numpy(y).to(device))
+            model_preds = model.predict(torch.from_numpy(X).to(device))
+            model_forward = model(torch.from_numpy(X).to(device))
+
+            self.assertTrue(np.allclose(ref_preds, model_preds.cpu().numpy()))
+            self.assertTrue(np.allclose(ref_preds, model_forward.cpu().numpy()))
+
+            inputX = torch.from_numpy(X).to(device)
+            inputX.requires_grad = True
+            self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+            self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/linear_model/ridge_test.py b/tests/unit/linear_model/ridge_test.py
index 30c257e..41f2c08 100644
--- a/tests/unit/linear_model/ridge_test.py
+++ b/tests/unit/linear_model/ridge_test.py
@@ -12,46 +12,50 @@
 
 class TestRidge(unittest.TestCase):
     def test_fit(self):
-        X = np.random.randn(BSZ, DIM)
-        y = np.random.randn(BSZ, 1)
+        for i in range(2):
+            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            X = np.random.randn(BSZ, DIM)
+            y = np.random.randn(BSZ, 1)
 
-        ref = linear_model.Ridge(fit_intercept=False)
-        ref.fit(X, y)
-        ref_preds = ref.predict(X)
+            ref = linear_model.Ridge(fit_intercept=False)
+            ref.fit(X, y)
+            ref_preds = ref.predict(X)
 
-        model = ml.linear_model.Ridge(fit_intercept=False)
-        model.fit(torch.from_numpy(X), torch.from_numpy(y))
-        model_preds = model.predict(torch.from_numpy(X))
-        model_forward = model(torch.from_numpy(X))
+            model = ml.linear_model.Ridge(fit_intercept=False)
+            model.fit(torch.from_numpy(X).to(device), torch.from_numpy(y).to(device))
+            model_preds = model.predict(torch.from_numpy(X).to(device))
+            model_forward = model(torch.from_numpy(X).to(device))
 
-        self.assertTrue(np.allclose(ref_preds, model_preds.numpy()))
-        self.assertTrue(np.allclose(ref_preds, model_forward.numpy()))
+            self.assertTrue(np.allclose(ref_preds, model_preds.cpu().numpy()))
+            self.assertTrue(np.allclose(ref_preds, model_forward.cpu().numpy()))
 
-        inputX = torch.from_numpy(X)
-        inputX.requires_grad = True
-        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
-        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+            inputX = torch.from_numpy(X).to(device)
+            inputX.requires_grad = True
+            self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+            self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
 
     def test_fit_intercept(self):
-        X = np.random.randn(BSZ, DIM)
-        y = np.random.randn(BSZ, 1)
-
-        ref = linear_model.Ridge(fit_intercept=True)
-        ref.fit(X, y)
-        ref_preds = ref.predict(X)
-
-        model = ml.linear_model.Ridge(fit_intercept=True)
-        model.fit(torch.from_numpy(X), torch.from_numpy(y))
-        model_preds = model.predict(torch.from_numpy(X))
-        model_forward = model(torch.from_numpy(X))
-
-        self.assertTrue(np.allclose(ref_preds, model_preds.numpy()))
-        self.assertTrue(np.allclose(ref_preds, model_forward.numpy()))
-
-        inputX = torch.from_numpy(X)
-        inputX.requires_grad = True
-        self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
-        self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
+        for i in range(2):
+            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            X = np.random.randn(BSZ, DIM)
+            y = np.random.randn(BSZ, 1)
+
+            ref = linear_model.Ridge(fit_intercept=True)
+            ref.fit(X, y)
+            ref_preds = ref.predict(X)
+
+            model = ml.linear_model.Ridge(fit_intercept=True)
+            model.fit(torch.from_numpy(X).to(device), torch.from_numpy(y).to(device))
+            model_preds = model.predict(torch.from_numpy(X).to(device))
+            model_forward = model(torch.from_numpy(X).to(device))
+
+            self.assertTrue(np.allclose(ref_preds, model_preds.cpu().numpy()))
+            self.assertTrue(np.allclose(ref_preds, model_forward.cpu().numpy()))
+
+            inputX = torch.from_numpy(X).to(device)
+            inputX.requires_grad = True
+            self.assertTrue(gradcheck(model.predict, inputX, eps=1e-6, atol=1e-3))
+            self.assertTrue(gradcheck(model, inputX, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/torchml/linear_model/ridge.py b/torchml/linear_model/ridge.py
index 1c43555..cace70b 100644
--- a/torchml/linear_model/ridge.py
+++ b/torchml/linear_model/ridge.py
@@ -80,14 +80,16 @@ def fit(self, X: torch.Tensor, y: torch.Tensor):
         """
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
 
+        device = X.device
+
         if self.fit_intercept:
-            X = torch.cat([torch.ones(X.shape[0], 1), X], dim=1)
+            X = torch.cat([torch.ones(X.shape[0], 1, device=device), X], dim=1)
 
         # L2 penalty term will not apply when alpha is 0
         if self.alpha == 0:
             self.weight = torch.pinverse(X.T @ X) @ X.T @ y
         else:
-            ridge = self.alpha * torch.eye(X.shape[1])
+            ridge = self.alpha * torch.eye(X.shape[1], device=device)
             # intercept term is not penalized when fit_intercept is true
             if self.fit_intercept:
                 ridge[0][0] = 0
@@ -112,5 +114,5 @@ def predict(self, X: torch.Tensor):
         ~~~
         """
         if self.fit_intercept:
-            X = torch.cat([torch.ones(X.shape[0], 1), X], dim=1)
+            X = torch.cat([torch.ones(X.shape[0], 1, device=X.device), X], dim=1)
         return X @ self.weight

From 38b44000022f26028e84f6f2273605262d8a6234 Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Sun, 13 Nov 2022 20:58:54 -0800
Subject: [PATCH 29/31] add gpu support for neighbors

---
 .../neighbors/k_neighbors_classifier_test.py  | 64 ++++++++++---------
 .../unit/neighbors/nearest_centroids_test.py  | 35 +++++-----
 .../unit/neighbors/nearest_neighbors_test.py  | 62 +++++++++---------
 torchml/neighbors/k_neighbors_classifier.py   | 32 ++++++----
 torchml/neighbors/nearest_centroid.py         |  7 +-
 5 files changed, 108 insertions(+), 92 deletions(-)

diff --git a/tests/unit/neighbors/k_neighbors_classifier_test.py b/tests/unit/neighbors/k_neighbors_classifier_test.py
index 7333fbc..1ea9782 100644
--- a/tests/unit/neighbors/k_neighbors_classifier_test.py
+++ b/tests/unit/neighbors/k_neighbors_classifier_test.py
@@ -12,37 +12,39 @@
 
 class TestkneighborsClassifier(unittest.TestCase):
     def test_knn_classifier(self):
-        for i in range(1, 5, 1):
-            X = np.random.randn(BSZ, DIM)
-            y = np.random.randint(low=-100, high=100, size=BSZ)
-            p = np.random.randn(5, DIM)
-
-            ref = neighbors.KNeighborsClassifier(
-                weights="distance" if i % 2 else "uniform", p=i
-            )
-            ref.fit(X, y)
-            refr = ref.predict(p)
-            refp = ref.predict_proba(p)
-
-            test = ml.neighbors.KNeighborsClassifier(
-                weights="distance" if i % 2 else "uniform", p=i
-            )
-            test.fit(torch.from_numpy(X), torch.from_numpy(y))
-            inputP = torch.from_numpy(p)
-            inputP.requires_grad = True
-            
-            testr = test.predict(torch.from_numpy(p))
-            testp = test.predict_proba(torch.from_numpy(p))
-            self.assertTrue(gradcheck(test.predict, inputP, eps=1e-6, atol=1e-3))
-            # self.assertTrue(gradcheck(test.predict_proba, inputP, eps=1e-20, atol=1e-3))
-            self.assertTrue(np.allclose(refr, testr.numpy()))
-            self.assertTrue(np.allclose(refp, testp.numpy()))
-
-            refr2 = ref.kneighbors(p)
-            testr2 = test.kneighbors(torch.from_numpy(p))
-            self.assertTrue(gradcheck(test.kneighbors, inputP, eps=1e-6, atol=1e-3))
-            self.assertTrue(np.allclose(refr2[0], testr2[0].numpy()))
-            self.assertTrue(np.allclose(refr2[1], testr2[1].numpy()))
+        for i in range(2):
+            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            for i in range(1, 5, 1):
+                X = np.random.randn(BSZ, DIM)
+                y = np.random.randint(low=-100, high=100, size=BSZ)
+                p = np.random.randn(5, DIM)
+
+                ref = neighbors.KNeighborsClassifier(
+                    weights="distance" if i % 2 else "uniform", p=i
+                )
+                ref.fit(X, y)
+                refr = ref.predict(p)
+                refp = ref.predict_proba(p)
+
+                test = ml.neighbors.KNeighborsClassifier(
+                    weights="distance" if i % 2 else "uniform", p=i
+                )
+                test.fit(torch.from_numpy(X).to(device), torch.from_numpy(y).to(device))
+                inputP = torch.from_numpy(p).to(device)
+                inputP.requires_grad = True
+
+                testr = test.predict(torch.from_numpy(p).to(device))
+                testp = test.predict_proba(torch.from_numpy(p).to(device))
+                self.assertTrue(gradcheck(test.predict, inputP, eps=1e-6, atol=1e-3))
+                # self.assertTrue(gradcheck(test.predict_proba, inputP, eps=1e-20, atol=1e-3))
+                self.assertTrue(np.allclose(refr, testr.cpu().numpy()))
+                self.assertTrue(np.allclose(refp, testp.cpu().numpy()))
+
+                refr2 = ref.kneighbors(p)
+                testr2 = test.kneighbors(torch.from_numpy(p).to(device))
+                self.assertTrue(gradcheck(test.kneighbors, inputP, eps=1e-6, atol=1e-3))
+                self.assertTrue(np.allclose(refr2[0], testr2[0].cpu().numpy()))
+                self.assertTrue(np.allclose(refr2[1], testr2[1].cpu().numpy()))
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/neighbors/nearest_centroids_test.py b/tests/unit/neighbors/nearest_centroids_test.py
index 5f2b35b..6db9653 100644
--- a/tests/unit/neighbors/nearest_centroids_test.py
+++ b/tests/unit/neighbors/nearest_centroids_test.py
@@ -13,23 +13,24 @@
 
 class Testcentroids(unittest.TestCase):
     def test_kneighbors(self):
-
-        for i in range(100):
-            X = np.random.randn(SAMPLES, FEA)
-            y = np.random.randint(1, CLS, size=SAMPLES)
-            torchX = torch.from_numpy(X)
-            torchy = torch.from_numpy(y)
-            ref = neighbors.NearestCentroid()
-            cent = ml.neighbors.NearestCentroid()
-            ref.fit(X, y)
-            cent.fit(torchX, torchy)
-            samp = np.random.randn(SAMPLES, FEA)
-            refres = ref.predict(samp)
-            centres = cent.predict(torch.from_numpy(samp)).numpy()
-            self.assertTrue(np.array_equal(refres, centres))
-            inputSamp = torch.from_numpy(samp)
-            inputSamp.requires_grad = True
-            self.assertTrue(gradcheck(cent.predict, inputSamp, eps=1e-6, atol=1e-3))
+        for i in range(2):
+            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            for i in range(100):
+                X = np.random.randn(SAMPLES, FEA)
+                y = np.random.randint(1, CLS, size=SAMPLES)
+                torchX = torch.from_numpy(X).to(device)
+                torchy = torch.from_numpy(y).to(device)
+                ref = neighbors.NearestCentroid()
+                cent = ml.neighbors.NearestCentroid()
+                ref.fit(X, y)
+                cent.fit(torchX, torchy)
+                samp = np.random.randn(SAMPLES, FEA)
+                refres = ref.predict(samp)
+                centres = cent.predict(torch.from_numpy(samp).to(device)).cpu().numpy()
+                self.assertTrue(np.array_equal(refres, centres))
+                inputSamp = torch.from_numpy(samp).to(device)
+                inputSamp.requires_grad = True
+                self.assertTrue(gradcheck(cent.predict, inputSamp, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/neighbors/nearest_neighbors_test.py b/tests/unit/neighbors/nearest_neighbors_test.py
index ea274fe..13317d5 100644
--- a/tests/unit/neighbors/nearest_neighbors_test.py
+++ b/tests/unit/neighbors/nearest_neighbors_test.py
@@ -10,36 +10,38 @@
 
 
 class Testkneighbors(unittest.TestCase):
-    def test_kneighbors(self):
-        for i in range(1, 5, 1):
-            X = np.random.randn(BSZ, DIM)
-            y = np.random.randn(5, DIM)
-            ref = neighbors.NearestNeighbors(p=i)
-            ref.fit(X)
-            test = ref.kneighbors(y)
-
-            model = ml.neighbors.NearestNeighbors(p=i)
-            model.fit(torch.from_numpy(X))
-            res = model.kneighbors(torch.from_numpy(y))
-
-            # return distance is true
-            self.assertTrue(np.allclose(test[0], res[0].numpy()))
-            self.assertTrue(np.allclose(test[1], res[1].numpy()))
-            inputY = torch.from_numpy(y)
-            inputY.requires_grad = True
-            self.assertTrue(gradcheck(model.kneighbors, inputY, eps=1e-6, atol=1e-3))
-
-            ref = neighbors.NearestNeighbors(p=i)
-            ref.fit(X)
-            test = ref.kneighbors(y, return_distance=False)
-
-            model = ml.neighbors.NearestNeighbors(p=i)
-            model.fit(torch.from_numpy(X))
-            res = model.kneighbors(torch.from_numpy(y), return_distance=False)
-
-            # return distance is false
-            self.assertTrue(np.allclose(test, res.numpy()))
-            self.assertTrue(gradcheck(model.kneighbors, inputY, eps=1e-6, atol=1e-3))
+    def test_kneighbors_classifier(self):
+        for i in range(2):
+            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            for i in range(1, 5, 1):
+                X = np.random.randn(BSZ, DIM)
+                y = np.random.randn(5, DIM)
+                ref = neighbors.NearestNeighbors(p=i)
+                ref.fit(X)
+                test = ref.kneighbors(y)
+
+                model = ml.neighbors.NearestNeighbors(p=i)
+                model.fit(torch.from_numpy(X).to(device))
+                res = model.kneighbors(torch.from_numpy(y).to(device))
+
+                # return distance is true
+                self.assertTrue(np.allclose(test[0], res[0].cpu().numpy()))
+                self.assertTrue(np.allclose(test[1], res[1].cpu().numpy()))
+                inputY = torch.from_numpy(y).to(device)
+                inputY.requires_grad = True
+                self.assertTrue(gradcheck(model.kneighbors, inputY, eps=1e-6, atol=1e-3))
+
+                ref = neighbors.NearestNeighbors(p=i)
+                ref.fit(X)
+                test = ref.kneighbors(y, return_distance=False)
+
+                model = ml.neighbors.NearestNeighbors(p=i)
+                model.fit(torch.from_numpy(X).to(device))
+                res = model.kneighbors(torch.from_numpy(y).to(device), return_distance=False)
+
+                # return distance is false
+                self.assertTrue(np.allclose(test, res.cpu().numpy()))
+                self.assertTrue(gradcheck(model.kneighbors, inputY, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/torchml/neighbors/k_neighbors_classifier.py b/torchml/neighbors/k_neighbors_classifier.py
index ed56256..62d94a9 100644
--- a/torchml/neighbors/k_neighbors_classifier.py
+++ b/torchml/neighbors/k_neighbors_classifier.py
@@ -1,7 +1,10 @@
 import numbers
 import warnings
+from typing import Tuple, Any
 
 import torch
+from torch import Tensor
+
 import torchml as ml
 
 
@@ -107,6 +110,7 @@ def fit(self, X: torch.Tensor, y: torch.Tensor):
         """
         self.KNN.fit(X)
         self.weights = self._check_weights(weights=self.weights)
+        device = X.device
         if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
             if y.ndim != 1:
                 warnings.warn(
@@ -122,7 +126,7 @@ def fit(self, X: torch.Tensor, y: torch.Tensor):
             self.outputs_2d_ = True
 
         self.classes_ = []
-        self._y = torch.empty(size=y.shape, dtype=torch.long)
+        self._y = torch.empty(size=y.shape, dtype=torch.long, device=device)
         for k in range(self._y.shape[1]):
             classes, self._y[:, k] = torch.unique(y[:, k], return_inverse=True)
             self.classes_.append(classes)
@@ -141,6 +145,7 @@ def predict(self, X: torch.Tensor) -> torch.Tensor:
 
         * `X` (torch.Tensor): the target point
         """
+        device = X.device
         if self.weights == "uniform":
             neigh_ind = self.KNN.kneighbors(X, return_distance=False)
             neigh_dist = None
@@ -157,7 +162,7 @@ def predict(self, X: torch.Tensor) -> torch.Tensor:
         n_queries = len(X)
         weights = self._get_weights(neigh_dist, self.weights)
 
-        y_pred = torch.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
+        y_pred = torch.empty((n_queries, n_outputs), dtype=classes_[0].dtype, device=device)
 
         for k, classes_k in enumerate(classes_):
             if weights is None:
@@ -182,6 +187,7 @@ def predict_proba(self, X: torch.Tensor) -> torch.Tensor:
 
         * `X` (torch.Tensor): the target point
         """
+        device = X.device
         if self.weights == "uniform":
             neigh_ind = self.KNN.kneighbors(X, return_distance=False)
             neigh_dist = None
@@ -198,13 +204,13 @@ def predict_proba(self, X: torch.Tensor) -> torch.Tensor:
 
         weights = self._get_weights(neigh_dist, self.weights)
         if weights is None:
-            weights = torch.ones_like(neigh_ind)
+            weights = torch.ones_like(neigh_ind, device=device)
 
         all_rows = torch.arange(n_queries)
         probabilities = []
         for k, classes_k in enumerate(classes_):
             pred_labels = _y[:, k][neigh_ind]
-            proba_k = torch.zeros((n_queries, len(classes_k)))
+            proba_k = torch.zeros((n_queries, len(classes_k)), device=device)
 
             for i, idx in enumerate(pred_labels.T):
                 proba_k[all_rows, idx] += weights[:, i]
@@ -264,21 +270,23 @@ def _get_weights(self, dist: torch.Tensor, weights: str) -> torch.Tensor:
                 "'distance', or a callable function"
             )
 
-    def _weighted_mode(self, a: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
-        res = torch.empty(0)
-        resi = torch.empty(0)
+    def _weighted_mode(self, a: torch.Tensor, w: torch.Tensor) -> tuple[Tensor | Any, Tensor | Any]:
+        device = a.device
+        res = torch.empty(0, device=device)
+        resi = torch.empty(0, device=device)
         for i, x in enumerate(a):
             res1 = self._weighted_mode_util(x, w)
-            res = torch.cat((res, torch.tensor([res1[0]])))
-            resi = torch.cat((resi, torch.tensor([res1[1]])))
+            res = torch.cat((res, torch.tensor([res1[0]], device=device)))
+            resi = torch.cat((resi, torch.tensor([res1[1]], device=device)))
         return res, resi
 
-    def _weighted_mode_util(self, a: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
+    def _weighted_mode_util(self, a: torch.Tensor, w: torch.Tensor) -> tuple[Any, Tensor]:
+        device = a.device
         unique_a = torch.unique(a)
-        res = torch.empty(0)
+        res = torch.empty(0, device=device)
         for i, x in enumerate(unique_a):
             cleared = (a == x).float()
             cleared_weights = cleared * w
             sum = torch.sum(cleared_weights)
-            res = torch.cat((res, torch.tensor([sum])))
+            res = torch.cat((res, torch.tensor([sum], device=device)))
         return unique_a[torch.argmax(res)], torch.max(res)
diff --git a/torchml/neighbors/nearest_centroid.py b/torchml/neighbors/nearest_centroid.py
index d6d7ae3..7d7ee23 100644
--- a/torchml/neighbors/nearest_centroid.py
+++ b/torchml/neighbors/nearest_centroid.py
@@ -64,6 +64,8 @@ def fit(self, X: torch.Tensor, y: torch.Tensor):
         * `y` (torch.Tensor): array-like of shape (n_samples,) Target values
         """
 
+        device = X.device
+
         n_samples, n_features = X.shape
 
         # y_ind: idx, y_classes: unique tensor
@@ -79,7 +81,7 @@ def fit(self, X: torch.Tensor, y: torch.Tensor):
 
         # Mask mapping each class to its members.
         self.centroids_ = torch.empty(
-            (n_classes, n_features), dtype=X.dtype, device=torch.device("cpu")
+            (n_classes, n_features), dtype=X.dtype, device=device
         )
         # Number of clusters in each class.
 
@@ -109,10 +111,11 @@ def predict(self, X: torch.tensor) -> torch.tensor:
         * (torch.Tensor): the predicted classes
 
         """
+        device = X.device
         if X is None or X.size(dim=0) < 1:
             print("Warning: check input size")
 
-        ret = torch.empty(X.size(dim=0))
+        ret = torch.empty(X.size(dim=0), device=device)
 
         for i in range(X.size(dim=0)):
             ret[i] = self.classes_[

From f02797217a343924a9472938ad05b12ce38868cb Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Sun, 13 Nov 2022 22:46:27 -0800
Subject: [PATCH 30/31] add gpu support

---
 tests/unit/linear_model/lasso_test.py         | 36 ++++++--
 .../linear_model/linear_regression_test.py    |  2 +-
 tests/unit/linear_model/ridge_test.py         |  4 +-
 .../neighbors/k_neighbors_classifier_test.py  |  2 +-
 .../unit/neighbors/nearest_centroids_test.py  |  2 +-
 .../unit/neighbors/nearest_neighbors_test.py  | 14 ++-
 tests/unit/svm/linear_svc_test.py             | 90 +++++++++++--------
 tests/unit/svm/linear_svr_test.py             | 66 ++++++++------
 torchml/linear_model/lasso.py                 |  4 +-
 torchml/neighbors/k_neighbors_classifier.py   | 12 ++-
 torchml/svm/linear_svc.py                     | 10 ++-
 torchml/svm/linear_svr.py                     |  4 +-
 12 files changed, 150 insertions(+), 96 deletions(-)

diff --git a/tests/unit/linear_model/lasso_test.py b/tests/unit/linear_model/lasso_test.py
index f88ed25..4ed6094 100644
--- a/tests/unit/linear_model/lasso_test.py
+++ b/tests/unit/linear_model/lasso_test.py
@@ -13,7 +13,7 @@
 class TestLasso(unittest.TestCase):
     def test_fit(self):
         for i in range(2):
-            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
 
             X = np.random.randn(BSZ, DIM)
             y = np.random.randn(BSZ, 1)
@@ -28,11 +28,17 @@ def test_fit(self):
             model_forward = model(torch.from_numpy(X).to(device))
 
             self.assertTrue(
-                np.allclose(ref_preds, model_preds[0].detach().cpu().numpy().flatten(), atol=1e-3)
+                np.allclose(
+                    ref_preds,
+                    model_preds[0].detach().cpu().numpy().flatten(),
+                    atol=1e-3,
+                )
             )
             self.assertTrue(
                 np.allclose(
-                    ref_preds, model_forward[0].detach().cpu().numpy().flatten(), atol=1e-3
+                    ref_preds,
+                    model_forward[0].detach().cpu().numpy().flatten(),
+                    atol=1e-3,
                 )
             )
 
@@ -43,7 +49,7 @@ def test_fit(self):
 
     def test_fit_intercept(self):
         for i in range(2):
-            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
             X = np.random.randn(BSZ, DIM)
             y = np.random.randn(BSZ, 1)
 
@@ -57,11 +63,17 @@ def test_fit_intercept(self):
             model_forward = model(torch.from_numpy(X).to(device))
 
             self.assertTrue(
-                np.allclose(ref_preds, model_preds[0].detach().cpu().numpy().flatten(), atol=1e-3)
+                np.allclose(
+                    ref_preds,
+                    model_preds[0].detach().cpu().numpy().flatten(),
+                    atol=1e-3,
+                )
             )
             self.assertTrue(
                 np.allclose(
-                    ref_preds, model_forward[0].detach().cpu().numpy().flatten(), atol=1e-3
+                    ref_preds,
+                    model_forward[0].detach().cpu().numpy().flatten(),
+                    atol=1e-3,
                 )
             )
 
@@ -72,7 +84,7 @@ def test_fit_intercept(self):
 
     def test_fit_positive(self):
         for i in range(2):
-            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
             X = np.random.randn(BSZ, DIM)
             y = np.random.randn(BSZ, 1)
 
@@ -86,11 +98,17 @@ def test_fit_positive(self):
             model_forward = model(torch.from_numpy(X).to(device))
 
             self.assertTrue(
-                np.allclose(ref_preds, model_preds[0].detach().cpu().numpy().flatten(), atol=1e-3)
+                np.allclose(
+                    ref_preds,
+                    model_preds[0].detach().cpu().numpy().flatten(),
+                    atol=1e-3,
+                )
             )
             self.assertTrue(
                 np.allclose(
-                    ref_preds, model_forward[0].detach().cpu().numpy().flatten(), atol=1e-3
+                    ref_preds,
+                    model_forward[0].detach().cpu().numpy().flatten(),
+                    atol=1e-3,
                 )
             )
 
diff --git a/tests/unit/linear_model/linear_regression_test.py b/tests/unit/linear_model/linear_regression_test.py
index c056daa..ed4c201 100644
--- a/tests/unit/linear_model/linear_regression_test.py
+++ b/tests/unit/linear_model/linear_regression_test.py
@@ -13,7 +13,7 @@
 class TestLinearRegression(unittest.TestCase):
     def test_fit(self):
         for i in range(2):
-            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
             X = np.random.randn(BSZ, DIM)
             y = np.random.randn(BSZ, 1)
 
diff --git a/tests/unit/linear_model/ridge_test.py b/tests/unit/linear_model/ridge_test.py
index 41f2c08..cd78111 100644
--- a/tests/unit/linear_model/ridge_test.py
+++ b/tests/unit/linear_model/ridge_test.py
@@ -13,7 +13,7 @@
 class TestRidge(unittest.TestCase):
     def test_fit(self):
         for i in range(2):
-            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
             X = np.random.randn(BSZ, DIM)
             y = np.random.randn(BSZ, 1)
 
@@ -36,7 +36,7 @@ def test_fit(self):
 
     def test_fit_intercept(self):
         for i in range(2):
-            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
             X = np.random.randn(BSZ, DIM)
             y = np.random.randn(BSZ, 1)
 
diff --git a/tests/unit/neighbors/k_neighbors_classifier_test.py b/tests/unit/neighbors/k_neighbors_classifier_test.py
index 1ea9782..719cae3 100644
--- a/tests/unit/neighbors/k_neighbors_classifier_test.py
+++ b/tests/unit/neighbors/k_neighbors_classifier_test.py
@@ -13,7 +13,7 @@
 class TestkneighborsClassifier(unittest.TestCase):
     def test_knn_classifier(self):
         for i in range(2):
-            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
             for i in range(1, 5, 1):
                 X = np.random.randn(BSZ, DIM)
                 y = np.random.randint(low=-100, high=100, size=BSZ)
diff --git a/tests/unit/neighbors/nearest_centroids_test.py b/tests/unit/neighbors/nearest_centroids_test.py
index 6db9653..bcbe872 100644
--- a/tests/unit/neighbors/nearest_centroids_test.py
+++ b/tests/unit/neighbors/nearest_centroids_test.py
@@ -14,7 +14,7 @@
 class Testcentroids(unittest.TestCase):
     def test_kneighbors(self):
         for i in range(2):
-            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
             for i in range(100):
                 X = np.random.randn(SAMPLES, FEA)
                 y = np.random.randint(1, CLS, size=SAMPLES)
diff --git a/tests/unit/neighbors/nearest_neighbors_test.py b/tests/unit/neighbors/nearest_neighbors_test.py
index 13317d5..a0cec86 100644
--- a/tests/unit/neighbors/nearest_neighbors_test.py
+++ b/tests/unit/neighbors/nearest_neighbors_test.py
@@ -12,7 +12,7 @@
 class Testkneighbors(unittest.TestCase):
     def test_kneighbors_classifier(self):
         for i in range(2):
-            device = torch.device('cuda' if torch.cuda.is_available() and i else 'cpu')
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
             for i in range(1, 5, 1):
                 X = np.random.randn(BSZ, DIM)
                 y = np.random.randn(5, DIM)
@@ -29,7 +29,9 @@ def test_kneighbors_classifier(self):
                 self.assertTrue(np.allclose(test[1], res[1].cpu().numpy()))
                 inputY = torch.from_numpy(y).to(device)
                 inputY.requires_grad = True
-                self.assertTrue(gradcheck(model.kneighbors, inputY, eps=1e-6, atol=1e-3))
+                self.assertTrue(
+                    gradcheck(model.kneighbors, inputY, eps=1e-6, atol=1e-3)
+                )
 
                 ref = neighbors.NearestNeighbors(p=i)
                 ref.fit(X)
@@ -37,11 +39,15 @@ def test_kneighbors_classifier(self):
 
                 model = ml.neighbors.NearestNeighbors(p=i)
                 model.fit(torch.from_numpy(X).to(device))
-                res = model.kneighbors(torch.from_numpy(y).to(device), return_distance=False)
+                res = model.kneighbors(
+                    torch.from_numpy(y).to(device), return_distance=False
+                )
 
                 # return distance is false
                 self.assertTrue(np.allclose(test, res.cpu().numpy()))
-                self.assertTrue(gradcheck(model.kneighbors, inputY, eps=1e-6, atol=1e-3))
+                self.assertTrue(
+                    gradcheck(model.kneighbors, inputY, eps=1e-6, atol=1e-3)
+                )
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/svm/linear_svc_test.py b/tests/unit/svm/linear_svc_test.py
index 3561794..93d70a6 100644
--- a/tests/unit/svm/linear_svc_test.py
+++ b/tests/unit/svm/linear_svc_test.py
@@ -9,53 +9,65 @@
 from torchml.svm import LinearSVC
 
 n_samples = 5000
-n_features = 5
-n_classes = 5
-n_informative = 5
+n_features = 4
+n_classes = 2
+n_informative = 4
 
 
 class TestLinearSVC(unittest.TestCase):
     def test_LinearSVC(self):
-        x, y = make_classification(
-            n_samples=n_samples,
-            n_features=n_features,
-            n_classes=n_classes,
-            n_informative=n_informative,
-            n_redundant=n_features - n_informative,
-        )
-        lsvc = LinearSVC(max_iter=1000)
-        start = time.time()
-        lsvc.fit(torch.from_numpy(x), torch.from_numpy(y))
-        end = time.time()
-        # print(end - start)
-        start = time.time()
-        reflsvc = svm.LinearSVC(max_iter=100000)
-        reflsvc.fit(x, y)
-
-        end = time.time()
-        # print(end - start)
-        self.assertTrue(np.allclose(lsvc.coef_.numpy(), reflsvc.coef_, atol=1e-2))
-        self.assertTrue(
-            np.allclose(lsvc.intercept_.numpy(), reflsvc.intercept_, atol=1e-2)
-        )
-        self.assertTrue(
-            np.allclose(
-                lsvc.decision_function(torch.from_numpy(x)),
-                reflsvc.decision_function(x),
-                atol=1e-2,
+        for i in range(2):
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
+            x, y = make_classification(
+                n_samples=n_samples,
+                n_features=n_features,
+                n_classes=n_classes,
+                n_informative=n_informative,
+                n_redundant=n_features - n_informative,
             )
-        )
+            lsvc = LinearSVC(max_iter=1000)
+            start = time.time()
+            lsvc.fit(torch.from_numpy(x).to(device), torch.from_numpy(y).to(device))
+            end = time.time()
+            # print(end - start)
+            start = time.time()
+            reflsvc = svm.LinearSVC(max_iter=100000)
+            reflsvc.fit(x, y)
 
-        inputX = torch.from_numpy(x)
-        inputX.requires_grad = True
-        self.assertTrue(gradcheck(lsvc.decision_function, inputX, eps=1e-6, atol=1e-3))
+            end = time.time()
+            # print(end - start)
+            self.assertTrue(
+                np.allclose(lsvc.coef_.cpu().numpy(), reflsvc.coef_, atol=1e-2)
+            )
+            self.assertTrue(
+                np.allclose(
+                    lsvc.intercept_.cpu().numpy(), reflsvc.intercept_, atol=1e-2
+                )
+            )
+            self.assertTrue(
+                np.allclose(
+                    lsvc.decision_function(torch.from_numpy(x).to(device))
+                    .cpu()
+                    .numpy(),
+                    reflsvc.decision_function(x),
+                    atol=1e-2,
+                )
+            )
+
+            inputX = torch.from_numpy(x).to(device)
+            inputX.requires_grad = True
+            self.assertTrue(
+                gradcheck(lsvc.decision_function, inputX, eps=1e-6, atol=1e-3)
+            )
 
-        self.assertTrue(
-            np.allclose(
-                lsvc.predict(torch.from_numpy(x)), reflsvc.predict(x), atol=1e-2
+            self.assertTrue(
+                np.allclose(
+                    lsvc.predict(torch.from_numpy(x).to(device)).cpu().numpy(),
+                    reflsvc.predict(x),
+                    atol=1e-2,
+                )
             )
-        )
-        self.assertTrue(gradcheck(lsvc.predict, inputX, eps=1e-6, atol=1e-3))
+            self.assertTrue(gradcheck(lsvc.predict, inputX, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/svm/linear_svr_test.py b/tests/unit/svm/linear_svr_test.py
index e6b0857..c8a5e9a 100644
--- a/tests/unit/svm/linear_svr_test.py
+++ b/tests/unit/svm/linear_svr_test.py
@@ -9,41 +9,49 @@
 from torchml.svm import LinearSVR
 
 n_samples = 5000
-n_features = 10
-n_informative = 7
+n_features = 4
+n_informative = 3
 
 
 class TestLinearSVR(unittest.TestCase):
     def test_LinearSVR(self):
-        x, y = make_regression(
-            n_samples=n_samples,
-            n_features=n_features,
-            n_informative=n_informative,
-        )
-        lsvr = LinearSVR(max_iter=1000)
-        start = time.time()
-        lsvr.fit(torch.from_numpy(x), torch.from_numpy(y))
-        end = time.time()
-        # print(end - start)
-        start = time.time()
-        reflsvr = svm.LinearSVR(max_iter=100000)
-        reflsvr.fit(x, y)
-
-        end = time.time()
-        # print(end - start)
-        self.assertTrue(np.allclose(lsvr.coef_.numpy(), reflsvr.coef_, atol=1e-2))
-        self.assertTrue(
-            np.allclose(lsvr.intercept_.numpy(), reflsvr.intercept_, atol=1e-2)
-        )
-        self.assertTrue(
-            np.allclose(
-                lsvr.predict(torch.from_numpy(x)), reflsvr.predict(x), atol=1e-2
+        for i in range(2):
+            device = torch.device("cuda" if torch.cuda.is_available() and i else "cpu")
+            x, y = make_regression(
+                n_samples=n_samples,
+                n_features=n_features,
+                n_informative=n_informative,
+            )
+            lsvr = LinearSVR(max_iter=1000)
+            start = time.time()
+            lsvr.fit(torch.from_numpy(x).to(device), torch.from_numpy(y).to(device))
+            end = time.time()
+            # print(end - start)
+            start = time.time()
+            reflsvr = svm.LinearSVR(max_iter=100000)
+            reflsvr.fit(x, y)
+
+            end = time.time()
+            # print(end - start)
+            self.assertTrue(
+                np.allclose(lsvr.coef_.cpu().numpy(), reflsvr.coef_, atol=1e-2)
+            )
+            self.assertTrue(
+                np.allclose(
+                    lsvr.intercept_.cpu().numpy(), reflsvr.intercept_, atol=1e-2
+                )
+            )
+            self.assertTrue(
+                np.allclose(
+                    lsvr.predict(torch.from_numpy(x).to(device)).cpu().numpy(),
+                    reflsvr.predict(x),
+                    atol=1e-2,
+                )
             )
-        )
 
-        inputX = torch.from_numpy(x)
-        inputX.requires_grad = True
-        self.assertTrue(gradcheck(lsvr.predict, inputX, eps=1e-6, atol=1e-3))
+            inputX = torch.from_numpy(x).to(device)
+            inputX.requires_grad = True
+            self.assertTrue(gradcheck(lsvr.predict, inputX, eps=1e-6, atol=1e-3))
 
 
 if __name__ == "__main__":
diff --git a/torchml/linear_model/lasso.py b/torchml/linear_model/lasso.py
index 557642b..5ebe4eb 100644
--- a/torchml/linear_model/lasso.py
+++ b/torchml/linear_model/lasso.py
@@ -125,7 +125,9 @@ def fit(self, X: torch.Tensor, y: torch.Tensor):
                 X, y, torch.tensor(self.alpha, dtype=torch.float64, device=device)
             )
         else:
-            self.weight = fit_lr(X, y, torch.tensor(self.alpha, dtype=torch.float64, device=device))
+            self.weight = fit_lr(
+                X, y, torch.tensor(self.alpha, dtype=torch.float64, device=device)
+            )
         self.weight = torch.stack(list(self.weight), dim=0)
 
     def predict(self, X: torch.Tensor):
diff --git a/torchml/neighbors/k_neighbors_classifier.py b/torchml/neighbors/k_neighbors_classifier.py
index 62d94a9..9a97053 100644
--- a/torchml/neighbors/k_neighbors_classifier.py
+++ b/torchml/neighbors/k_neighbors_classifier.py
@@ -162,7 +162,9 @@ def predict(self, X: torch.Tensor) -> torch.Tensor:
         n_queries = len(X)
         weights = self._get_weights(neigh_dist, self.weights)
 
-        y_pred = torch.empty((n_queries, n_outputs), dtype=classes_[0].dtype, device=device)
+        y_pred = torch.empty(
+            (n_queries, n_outputs), dtype=classes_[0].dtype, device=device
+        )
 
         for k, classes_k in enumerate(classes_):
             if weights is None:
@@ -270,7 +272,9 @@ def _get_weights(self, dist: torch.Tensor, weights: str) -> torch.Tensor:
                 "'distance', or a callable function"
             )
 
-    def _weighted_mode(self, a: torch.Tensor, w: torch.Tensor) -> tuple[Tensor | Any, Tensor | Any]:
+    def _weighted_mode(
+        self, a: torch.Tensor, w: torch.Tensor
+    ) -> tuple[Tensor | Any, Tensor | Any]:
         device = a.device
         res = torch.empty(0, device=device)
         resi = torch.empty(0, device=device)
@@ -280,7 +284,9 @@ def _weighted_mode(self, a: torch.Tensor, w: torch.Tensor) -> tuple[Tensor | Any
             resi = torch.cat((resi, torch.tensor([res1[1]], device=device)))
         return res, resi
 
-    def _weighted_mode_util(self, a: torch.Tensor, w: torch.Tensor) -> tuple[Any, Tensor]:
+    def _weighted_mode_util(
+        self, a: torch.Tensor, w: torch.Tensor
+    ) -> tuple[Any, Tensor]:
         device = a.device
         unique_a = torch.unique(a)
         res = torch.empty(0, device=device)
diff --git a/torchml/svm/linear_svc.py b/torchml/svm/linear_svc.py
index ff9614e..127f4f1 100644
--- a/torchml/svm/linear_svc.py
+++ b/torchml/svm/linear_svc.py
@@ -123,11 +123,12 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         """
         if self.C < 0:
             raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
+        device = X.device
         self.classes_ = torch.unique(y)
         assert X.shape[0] == y.shape[0], "Number of X and y rows don't match"
         m, n = X.shape
-        self.coef_ = torch.empty((0, n))
-        self.intercept_ = torch.empty((0))
+        self.coef_ = torch.empty((0, n), device=device)
+        self.intercept_ = torch.empty((0), device=device)
         if self.classes_.shape[0] == 2:
             self._fit_with_one_class(
                 X, y, self.classes_[1], sample_weight=sample_weight
@@ -169,6 +170,7 @@ def predict(self, X: torch.Tensor) -> torch.Tensor:
     def _fit_with_one_class(
         self, X: torch.Tensor, y: torch.Tensor, fitting_class: any, sample_weight=None
     ):
+        device = X.device
         m, n = X.shape
 
         y = torch.unsqueeze(y, 1)
@@ -186,9 +188,9 @@ def _fit_with_one_class(
         loss = cp.multiply((1 / 2.0), cp.norm(w, 2))
 
         if self.fit_intercept:
-            hinge = cp.pos(ones - cp.multiply(y, X_param @ w + b))
+            hinge = cp.pos(ones - cp.multiply(y.cpu(), X_param @ w + b))
         else:
-            hinge = cp.pos(ones - cp.multiply(y, X_param @ w))
+            hinge = cp.pos(ones - cp.multiply(y.cpu(), X_param @ w))
 
         if self.loss == "squared_hinge":
             loss += cp.multiply(self.C, cp.sum(cp.square(hinge)))
diff --git a/torchml/svm/linear_svr.py b/torchml/svm/linear_svr.py
index 913390e..0e2c67b 100644
--- a/torchml/svm/linear_svr.py
+++ b/torchml/svm/linear_svr.py
@@ -125,9 +125,9 @@ def fit(self, X: torch.Tensor, y: torch.Tensor, sample_weight=None):
         loss = cp.multiply((1 / 2.0), cp.norm(w, 2))
 
         if self.fit_intercept:
-            hinge = cp.pos(cp.abs(y - (X_param @ w + b)) - self.epsilon)
+            hinge = cp.pos(cp.abs(y.cpu() - (X_param @ w + b)) - self.epsilon)
         else:
-            hinge = cp.pos(cp.abs(y - (X_param @ w + b)) - self.epsilon)
+            hinge = cp.pos(cp.abs(y.cpu() - (X_param @ w + b)) - self.epsilon)
 
         if self.loss == "epsilon_insensitive":
             loss += self.C * cp.sum(cp.square(hinge))

From c629c32a4e41c326af55fb2cfa0f295de5809c4c Mon Sep 17 00:00:00 2001
From: David Zhang <210057zzh@gmail.com>
Date: Sun, 13 Nov 2022 22:53:07 -0800
Subject: [PATCH 31/31] add gpu support

---
 tests/unit/neighbors/k_neighbors_classifier_test.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/unit/neighbors/k_neighbors_classifier_test.py b/tests/unit/neighbors/k_neighbors_classifier_test.py
index 719cae3..84f5d13 100644
--- a/tests/unit/neighbors/k_neighbors_classifier_test.py
+++ b/tests/unit/neighbors/k_neighbors_classifier_test.py
@@ -30,13 +30,12 @@ def test_knn_classifier(self):
                     weights="distance" if i % 2 else "uniform", p=i
                 )
                 test.fit(torch.from_numpy(X).to(device), torch.from_numpy(y).to(device))
-                inputP = torch.from_numpy(p).to(device)
+                inputP = torch.from_numpy(p).to(device).double()
                 inputP.requires_grad = True
 
                 testr = test.predict(torch.from_numpy(p).to(device))
                 testp = test.predict_proba(torch.from_numpy(p).to(device))
                 self.assertTrue(gradcheck(test.predict, inputP, eps=1e-6, atol=1e-3))
-                # self.assertTrue(gradcheck(test.predict_proba, inputP, eps=1e-20, atol=1e-3))
                 self.assertTrue(np.allclose(refr, testr.cpu().numpy()))
                 self.assertTrue(np.allclose(refp, testp.cpu().numpy()))