From 42f67368340f7e1b50e7b7c1cac4429867b7755f Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Sun, 12 Mar 2023 18:47:27 +0100 Subject: [PATCH 01/52] FISTA CPU --- skglm/gpu/cpu_solver.py | 56 +++++++++++++++++++++++++++++++++ skglm/gpu/tests/test_utils.py | 15 +++++++++ skglm/gpu/utils/__init__.py | 0 skglm/gpu/utils/device_utils.py | 0 skglm/gpu/utils/host_utils.py | 33 +++++++++++++++++++ 5 files changed, 104 insertions(+) create mode 100644 skglm/gpu/cpu_solver.py create mode 100644 skglm/gpu/tests/test_utils.py create mode 100644 skglm/gpu/utils/__init__.py create mode 100644 skglm/gpu/utils/device_utils.py create mode 100644 skglm/gpu/utils/host_utils.py diff --git a/skglm/gpu/cpu_solver.py b/skglm/gpu/cpu_solver.py new file mode 100644 index 000000000..e960629f8 --- /dev/null +++ b/skglm/gpu/cpu_solver.py @@ -0,0 +1,56 @@ +import numpy as np + +from skglm.utils.prox_funcs import ST_vec +from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit + + +class CPUSolver: + + def __init__(self, max_iter=1000, verbose=0): + self.max_iter = max_iter + self.verbose = verbose + + def solve(self, X, y, lmbd): + n_samples, n_features = X.shape + + # compute step + lipschitz = np.linalg.norm(X, ord=2) ** 2 + if lipschitz == 0.: + return np.zeros(n_features) + + step = 1 / lipschitz + + # init vars in device + w = np.zeros(n_features) + old_w = np.zeros(n_features) + mid_w = np.zeros(n_features) + grad = np.zeros(n_features) + + t_old, t_new = 1, 1 + + for it in range(self.max_iter): + + # compute grad + grad = X.T @ (X @ mid_w - y) + + # forward / backward + mid_w = mid_w - step * grad + w = ST_vec(mid_w, step * lmbd) + + if self.verbose: + p_obj = compute_obj(X, y, lmbd, w) + opt_crit = eval_opt_crit(X, y, lmbd, w) + + print( + f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" + ) + + # extrapolate + mid_w = w + ((t_old - 1) / t_new) * (w - old_w) + + # update FISTA vars + t_old = t_new + t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2 + old_w = np.copy(w) + + return w diff --git a/skglm/gpu/tests/test_utils.py b/skglm/gpu/tests/test_utils.py new file mode 100644 index 000000000..63da7e3cf --- /dev/null +++ b/skglm/gpu/tests/test_utils.py @@ -0,0 +1,15 @@ +import numpy as np +from skglm.gpu.utils.host_utils import compute_obj + + +def test_compute_obj(): + + # generate dummy data + X = np.eye(3) + y = np.array([1, 0, 1]) + w = np.array([1, 2, -3]) + lmbd = 10. + + p_obj = compute_obj(X, y, lmbd, w) + + np.testing.assert_array_equal(p_obj, 0.5 * 20 + 10. * 6) diff --git a/skglm/gpu/utils/__init__.py b/skglm/gpu/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/skglm/gpu/utils/device_utils.py b/skglm/gpu/utils/device_utils.py new file mode 100644 index 000000000..e69de29bb diff --git a/skglm/gpu/utils/host_utils.py b/skglm/gpu/utils/host_utils.py new file mode 100644 index 000000000..659a8de39 --- /dev/null +++ b/skglm/gpu/utils/host_utils.py @@ -0,0 +1,33 @@ +import numpy as np +from numpy.linalg import norm + +from numba import njit + + +def compute_obj(X, y, lmbd, w): + return 0.5 * norm(y - X @ w) ** 2 + lmbd * norm(w, ord=1) + + +def eval_opt_crit(X, y, lmbd, w): + grad = X.T @ (X @ w - y) + opt_crit = _compute_dist_subdiff(w, grad, lmbd) + + return opt_crit + + +@njit("f8(f8[:], f8[:], f8)") +def _compute_dist_subdiff(w, grad, lmbd): + max_dist = 0. + + for i in range(len(w)): + grad_i = grad[i] + w_i = w[i] + + if w[i] == 0.: + dist = max(abs(grad_i) - lmbd, 0.) + else: + dist = abs(grad_i + np.sign(w_i) * lmbd) + + max_dist = max(max_dist, dist) + + return max_dist From 116dda1448b232f9564f8dd1bb32a4d497fec825 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Sun, 12 Mar 2023 18:47:41 +0100 Subject: [PATCH 02/52] cupy solver --- skglm/gpu/__init__.py | 6 ++++ skglm/gpu/cupy_solver.py | 66 ++++++++++++++++++++++++++++++++++ skglm/gpu/example.py | 56 +++++++++++++++++++++++++++++ skglm/gpu/tests/test_solver.py | 30 ++++++++++++++++ 4 files changed, 158 insertions(+) create mode 100644 skglm/gpu/__init__.py create mode 100644 skglm/gpu/cupy_solver.py create mode 100644 skglm/gpu/example.py create mode 100644 skglm/gpu/tests/test_solver.py diff --git a/skglm/gpu/__init__.py b/skglm/gpu/__init__.py new file mode 100644 index 000000000..2a6008fdc --- /dev/null +++ b/skglm/gpu/__init__.py @@ -0,0 +1,6 @@ +"""Solve Lasso problem using FISTA GPU-implementation. + +Problem reads:: + + min_w (1/2) * ||y - Xw||^2 + lmbd * ||w||_1 +""" diff --git a/skglm/gpu/cupy_solver.py b/skglm/gpu/cupy_solver.py new file mode 100644 index 000000000..28abed4d0 --- /dev/null +++ b/skglm/gpu/cupy_solver.py @@ -0,0 +1,66 @@ +import cupy as cp +import numpy as np +from numpy.linalg import norm + +from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit + + +class CupySolver: + + def __init__(self, max_iter=1000, verbose=0): + self.max_iter = max_iter + self.verbose = verbose + + def solve(self, X, y, lmbd): + n_samples, n_features = X.shape + + # compute step + lipschitz = np.linalg.norm(X, ord=2) ** 2 + if lipschitz == 0.: + return np.zeros(n_features) + + step = 1 / lipschitz + + # transfer to device + X_gpu = cp.array(X) + y_gpu = cp.array(y) + + # init vars in device + w = cp.zeros(n_features) + old_w = cp.zeros(n_features) + mid_w = cp.zeros(n_features) + grad = cp.zeros(n_features) + + t_old, t_new = 1, 1 + + for it in range(self.max_iter): + + # compute grad + cp.dot(X_gpu.T, X_gpu @ mid_w - y_gpu, out=grad) + + # forward / backward: w = ST(mid_w - step * grad, step * lmbd) + mid_w = mid_w - step * grad + w = cp.sign(mid_w) * cp.maximum(cp.abs(mid_w) - step * lmbd, 0.) + + if self.verbose: + w_cpu = cp.asnumpy(w) + + p_obj = compute_obj(X, y, lmbd, w_cpu) + opt_crit = eval_opt_crit(X, y, lmbd, w_cpu) + + print( + f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" + ) + + # extrapolate + mid_w = w + ((t_old - 1) / t_new) * (w - old_w) + + # update FISTA vars + t_old = t_new + t_new = (1 + cp.sqrt(1 + 4 * t_old ** 2)) / 2 + old_w = cp.copy(w) + + # transfer back to host + w_cpu = cp.asnumpy(w) + + return w_cpu diff --git a/skglm/gpu/example.py b/skglm/gpu/example.py new file mode 100644 index 000000000..c41bb87d1 --- /dev/null +++ b/skglm/gpu/example.py @@ -0,0 +1,56 @@ +import time + +import numpy as np +from numpy.linalg import norm + +from sklearn.linear_model import Lasso +from skglm.gpu.cupy_solver import CupySolver + +from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit + + +random_state = 1265 +n_samples, n_features = 100_000, 300 +reg = 1e-2 + +# generate dummy data +rng = np.random.RandomState(random_state) +X = rng.randn(n_samples, n_features) +y = rng.randn(n_samples) + + +# set lambda +lmbd_max = norm(X.T @ y, ord=np.inf) +lmbd = reg * lmbd_max + + +# solve problem +start = time.perf_counter() +solver = CupySolver(verbose=0) +w_gpu = solver.solve(X, y, lmbd) +end = time.perf_counter() + +print("gpu time: ", end - start) + + +start = time.perf_counter() +estimator = Lasso(alpha=lmbd / n_samples, fit_intercept=False) +estimator.fit(X, y) +end = time.perf_counter() +print("sklearn time: ", end - start) + +w_sk = estimator.coef_ + + +print( + "Objective\n" + f"gpu : {compute_obj(X, y, lmbd, w_gpu):.8f}\n" + f"sklearn: {compute_obj(X, y, lmbd, w_sk):.8f}" +) + + +print( + "Optimality condition\n" + f"gpu : {eval_opt_crit(X, y, lmbd, w_gpu):.8f}\n" + f"sklearn: {eval_opt_crit(X, y, lmbd, w_sk):.8f}" +) diff --git a/skglm/gpu/tests/test_solver.py b/skglm/gpu/tests/test_solver.py new file mode 100644 index 000000000..5ef4a8bcc --- /dev/null +++ b/skglm/gpu/tests/test_solver.py @@ -0,0 +1,30 @@ +import pytest + +import numpy as np +from numpy.linalg import norm + +from skglm.gpu.cpu_solver import CPUSolver +from skglm.gpu.cupy_solver import CupySolver + +from skglm.gpu.utils.host_utils import eval_opt_crit + + +@pytest.mark.parametrize("solver", [CupySolver(), CPUSolver()]) +def test_solves(solver): + random_state = 1265 + n_samples, n_features = 100, 30 + reg = 1e-2 + + # generate dummy data + rng = np.random.RandomState(random_state) + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + + # set lambda + lmbd_max = norm(X.T @ y, ord=np.inf) + lmbd = reg * lmbd_max + + w = solver.solve(X, y, lmbd) + + stop_crit = eval_opt_crit(X, y, lmbd, w) + np.testing.assert_allclose(stop_crit, 0., atol=1e-9) From ff79040e6119b3c3badf4903d568f840a8979605 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 13 Mar 2023 15:49:28 +0100 Subject: [PATCH 03/52] unittest eval optimality condition --- skglm/gpu/tests/test_utils.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/skglm/gpu/tests/test_utils.py b/skglm/gpu/tests/test_utils.py index 63da7e3cf..3df4b5db7 100644 --- a/skglm/gpu/tests/test_utils.py +++ b/skglm/gpu/tests/test_utils.py @@ -1,5 +1,7 @@ import numpy as np -from skglm.gpu.utils.host_utils import compute_obj +from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit + +from sklearn.linear_model import Lasso def test_compute_obj(): @@ -13,3 +15,21 @@ def test_compute_obj(): p_obj = compute_obj(X, y, lmbd, w) np.testing.assert_array_equal(p_obj, 0.5 * 20 + 10. * 6) + + +def test_eval_optimality(): + rng = np.random.RandomState(1235) + n_samples, n_features = 10, 5 + + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + lmbd = 1. + + estimator = Lasso( + alpha=lmbd / n_samples, fit_intercept=False, tol=1e-9 + ).fit(X, y) + + np.testing.assert_allclose( + eval_opt_crit(X, y, lmbd, estimator.coef_), 0., + atol=1e-9 + ) From 1385900fc7d4890c37006369df150d8a3388d8fd Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 13 Mar 2023 15:49:45 +0100 Subject: [PATCH 04/52] cleanups cpu solver --- skglm/gpu/cpu_solver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skglm/gpu/cpu_solver.py b/skglm/gpu/cpu_solver.py index e960629f8..2c8f6a92f 100644 --- a/skglm/gpu/cpu_solver.py +++ b/skglm/gpu/cpu_solver.py @@ -6,7 +6,7 @@ class CPUSolver: - def __init__(self, max_iter=1000, verbose=0): + def __init__(self, max_iter=1000, verbose=0): self.max_iter = max_iter self.verbose = verbose @@ -20,7 +20,7 @@ def solve(self, X, y, lmbd): step = 1 / lipschitz - # init vars in device + # init vars w = np.zeros(n_features) old_w = np.zeros(n_features) mid_w = np.zeros(n_features) From e63cdb790d16406c708643d298749abebf3adbe5 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 13 Mar 2023 16:03:33 +0100 Subject: [PATCH 05/52] jax solver --- skglm/gpu/example.py | 10 ++++- skglm/gpu/jax_solver.py | 89 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 skglm/gpu/jax_solver.py diff --git a/skglm/gpu/example.py b/skglm/gpu/example.py index c41bb87d1..3cf7af912 100644 --- a/skglm/gpu/example.py +++ b/skglm/gpu/example.py @@ -5,12 +5,13 @@ from sklearn.linear_model import Lasso from skglm.gpu.cupy_solver import CupySolver +from skglm.gpu.jax_solver import JaxSolver from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit random_state = 1265 -n_samples, n_features = 100_000, 300 +n_samples, n_features = 100, 30 reg = 1e-2 # generate dummy data @@ -23,10 +24,15 @@ lmbd_max = norm(X.T @ y, ord=np.inf) lmbd = reg * lmbd_max +solver = JaxSolver(verbose=1, use_auto_diff=False) + +# cache grad +solver.max_iter = 2 +solver.solve(X, y, lmbd) # solve problem start = time.perf_counter() -solver = CupySolver(verbose=0) +solver.max_iter = 1000 w_gpu = solver.solve(X, y, lmbd) end = time.perf_counter() diff --git a/skglm/gpu/jax_solver.py b/skglm/gpu/jax_solver.py new file mode 100644 index 000000000..ac9c23692 --- /dev/null +++ b/skglm/gpu/jax_solver.py @@ -0,0 +1,89 @@ +# if not set, raises an error related to CUDA linking API. +# as recommended, setting the 'XLA_FLAGS' to bypass it. +# side-effect: (perhaps) slow compilation time. +import os +os.environ['XLA_FLAGS'] = '--xla_gpu_force_compilation_parallelism=1' # noqa + +import numpy as np + +import jax +import jax.numpy as jnp +# set float64 as default float type. +# if not, amplifies rounding errors. +jax.config.update("jax_enable_x64", True) # noqa + +from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit + + +class JaxSolver: + + def __init__(self, max_iter=1000, use_auto_diff=True, verbose=0): + self.max_iter = max_iter + self.use_auto_diff = use_auto_diff + self.verbose = verbose + + def solve(self, X, y, lmbd): + n_samples, n_features = X.shape + + # compute step + lipschitz = np.linalg.norm(X, ord=2) ** 2 + if lipschitz == 0.: + return np.zeros(n_features) + + step = 1 / lipschitz + + # transfer to device + X_gpu = jnp.asarray(X) + y_gpu = jnp.asarray(y) + + # get grad func of datafit + if self.use_auto_diff: + grad_quad_loss = jax.grad(_quad_loss) + + # init vars in device + w = jnp.zeros(n_features) + old_w = jnp.zeros(n_features) + mid_w = jnp.zeros(n_features) + grad = jnp.zeros(n_features) + + t_old, t_new = 1, 1 + + for it in range(self.max_iter): + + # compute grad + if self.use_auto_diff: + grad = grad_quad_loss(mid_w, X_gpu, y_gpu) + else: + grad = jnp.dot(X_gpu.T, jnp.dot(X_gpu, mid_w) - y_gpu) + + # forward / backward + mid_w = mid_w - step * grad + w = jnp.sign(mid_w) * jnp.maximum(jnp.abs(mid_w) - step * lmbd, 0.) + + if self.verbose: + w_cpu = np.asarray(w, dtype=np.float64) + + p_obj = compute_obj(X, y, lmbd, w_cpu) + opt_crit = eval_opt_crit(X, y, lmbd, w_cpu) + + print( + f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" + ) + + # extrapolate + mid_w = w + ((t_old - 1) / t_new) * (w - old_w) + + # update FISTA vars + t_old = t_new + t_new = 0.5 * (1 + jnp.sqrt(1. + 4. * t_old ** 2)) + old_w = jnp.copy(w) + + # transfer back to host + w_cpu = np.asarray(w, dtype=np.float64) + + return w_cpu + + +def _quad_loss(w, X_gpu, y_gpu): + pred_y = jnp.dot(X_gpu, w) + return 0.5 * jnp.sum((y_gpu - pred_y) ** 2) From d49291c83223c610d2a342fde583a554014f9da0 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 13 Mar 2023 16:03:45 +0100 Subject: [PATCH 06/52] add unittest jax solver --- skglm/gpu/tests/test_solver.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/skglm/gpu/tests/test_solver.py b/skglm/gpu/tests/test_solver.py index 5ef4a8bcc..104a36df7 100644 --- a/skglm/gpu/tests/test_solver.py +++ b/skglm/gpu/tests/test_solver.py @@ -5,11 +5,15 @@ from skglm.gpu.cpu_solver import CPUSolver from skglm.gpu.cupy_solver import CupySolver +from skglm.gpu.jax_solver import JaxSolver from skglm.gpu.utils.host_utils import eval_opt_crit -@pytest.mark.parametrize("solver", [CupySolver(), CPUSolver()]) +@pytest.mark.parametrize("solver", [CupySolver(), + CPUSolver(), + JaxSolver(use_auto_diff=False), + JaxSolver(use_auto_diff=True)]) def test_solves(solver): random_state = 1265 n_samples, n_features = 100, 30 From ec8c6638ada0b5d6d15105fd35b787686c454cfd Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 13 Mar 2023 16:15:52 +0100 Subject: [PATCH 07/52] pass flake8 --- skglm/gpu/cupy_solver.py | 1 - skglm/gpu/example.py | 1 - skglm/gpu/jax_solver.py | 8 ++++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/skglm/gpu/cupy_solver.py b/skglm/gpu/cupy_solver.py index 28abed4d0..153ca3c1c 100644 --- a/skglm/gpu/cupy_solver.py +++ b/skglm/gpu/cupy_solver.py @@ -1,6 +1,5 @@ import cupy as cp import numpy as np -from numpy.linalg import norm from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit diff --git a/skglm/gpu/example.py b/skglm/gpu/example.py index 3cf7af912..6e2ba08e2 100644 --- a/skglm/gpu/example.py +++ b/skglm/gpu/example.py @@ -4,7 +4,6 @@ from numpy.linalg import norm from sklearn.linear_model import Lasso -from skglm.gpu.cupy_solver import CupySolver from skglm.gpu.jax_solver import JaxSolver from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit diff --git a/skglm/gpu/jax_solver.py b/skglm/gpu/jax_solver.py index ac9c23692..54a21bda0 100644 --- a/skglm/gpu/jax_solver.py +++ b/skglm/gpu/jax_solver.py @@ -4,15 +4,15 @@ import os os.environ['XLA_FLAGS'] = '--xla_gpu_force_compilation_parallelism=1' # noqa -import numpy as np +import numpy as np # noqa -import jax -import jax.numpy as jnp +import jax # noqa +import jax.numpy as jnp # noqa # set float64 as default float type. # if not, amplifies rounding errors. jax.config.update("jax_enable_x64", True) # noqa -from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit +from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit # noqa class JaxSolver: From b703d25baa59293141fbf95ab8cf8926b668fae8 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 13 Mar 2023 18:56:01 +0100 Subject: [PATCH 08/52] numba solver layout --- skglm/gpu/numba_solver.py | 97 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 skglm/gpu/numba_solver.py diff --git a/skglm/gpu/numba_solver.py b/skglm/gpu/numba_solver.py new file mode 100644 index 000000000..7574ffe1e --- /dev/null +++ b/skglm/gpu/numba_solver.py @@ -0,0 +1,97 @@ +import math +import numpy as np +from numba import cuda + + +from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit + + +# max number of threads that one block could contain +# https://forums.developer.nvidia.com/t/maximum-number-of-threads-on-thread-block/46392 +N_THREADS = 1024 + + +class NumbaSolver: + + def __init__(self, max_iter=1000, verbose=0): + self.max_iter = max_iter + self.verbose = verbose + + def solve(self, X, y, lmbd): + n_samples, n_features = X.shape + + # compute step + lipschitz = np.linalg.norm(X, ord=2) ** 2 + if lipschitz == 0.: + return np.zeros(n_features) + + step = 1 / lipschitz + + # number of block to use along each axis when calling kernel + n_blocks_axis_0, n_blocks_axis_1 = [math.ceil(n / N_THREADS) for n in X.shape] + + # transfer to device + X_gpu = cuda.as_cuda_array(X) + y_gpu = cuda.as_cuda_array(y) + + # init vars on device + w = cuda.device_array(n_features) + old_w = cuda.device_array(n_features) + mid_w = cuda.device_array(n_features) + + grad = cuda.device_array(n_features) + minus_residual = cuda.device_array(n_samples) + + t_old, t_new = 1, 1 + + for it in range(self.max_iter): + + __compute_minus_residual[n_blocks_axis_0, N_THREADS]( + X_gpu, y_gpu, w, out=minus_residual) + + __compute_grad[n_blocks_axis_1, N_THREADS]( + X_gpu, minus_residual, out=grad) + + __forward_backward[n_blocks_axis_1, N_THREADS]( + mid_w, grad, step, lmbd, out=w) + + if self.verbose: + p_obj = compute_obj(X, y, lmbd, w) + opt_crit = eval_opt_crit(X, y, lmbd, w) + + print( + f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" + ) + + # extrapolate + coef = (t_old - 1) / t_new + __extrapolate[n_blocks_axis_1, N_THREADS](w, old_w, coef, out=mid_w) + + # update FISTA vars + t_old = t_new + t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2 + # in `copy_to_device`: `self` is destination and `other` is source + old_w.copy_to_device(w) + + return w + + +def __compute_minus_residual(X_gpu, y_gpu, w, out): + # X_gpu @ w - y_gpu + pass + + +def __compute_grad(X_gpu, minus_residual, out): + # X.T @ minus_residual + pass + + +def __forward_backward(): + # mid_w = mid_w - step * grad + # w = ST_vec(mid_w, step * lmbd) + pass + + +def __extrapolate(w, old_w, coef, out): + # mid_w = w + ((t_old - 1) / t_new) * (w - old_w) + pass From c82e35239b09bc2c524d238eb9c5a3f5827f081f Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 13 Mar 2023 22:05:28 +0100 Subject: [PATCH 09/52] numba cuda utils --- skglm/gpu/numba_solver.py | 69 +++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/skglm/gpu/numba_solver.py b/skglm/gpu/numba_solver.py index 7574ffe1e..0c2ba0432 100644 --- a/skglm/gpu/numba_solver.py +++ b/skglm/gpu/numba_solver.py @@ -76,22 +76,71 @@ def solve(self, X, y, lmbd): return w +@cuda.jit def __compute_minus_residual(X_gpu, y_gpu, w, out): - # X_gpu @ w - y_gpu - pass + # compute: out = X_gpu @ w - y_gpu + i = cuda.grid(1) + n_samples, n_features = X_gpu.shape + if i >= n_samples: + return + tmp = 0. + for j in range(n_features): + tmp += X_gpu[i, j] * w[j] + tmp += y_gpu[i] + + out[i] = tmp + + +@cuda.jit def __compute_grad(X_gpu, minus_residual, out): - # X.T @ minus_residual - pass + # compute: out=X.T @ minus_residual + j = cuda.grid(1) + + n_samples, n_features = X_gpu.shape + if j >= n_features: + return + + tmp = 0. + for i in range(n_samples): + tmp += X_gpu[i, j] * minus_residual[i] + + out[j] = tmp -def __forward_backward(): - # mid_w = mid_w - step * grad - # w = ST_vec(mid_w, step * lmbd) - pass +@cuda.jit +def __forward_backward(mid_w, grad, step, lmbd, out): + # forward: mid_w = mid_w - step * grad + # backward: w = ST_vec(mid_w, step * lmbd) + j = cuda.grid(1) + n_features = len(mid_w) + if j >= n_features: + return + # forward + tmp = mid_w[j] - step * grad[j] + + # backward + level = step * lmbd + if abs(tmp) <= level: + tmp = 0. + elif tmp > level: + tmp = tmp - level + else: + tmp = tmp + level + + out[j] = tmp + + +@cuda.jit def __extrapolate(w, old_w, coef, out): - # mid_w = w + ((t_old - 1) / t_new) * (w - old_w) - pass + # compute: out = w + coef * (w - old_w) + j = cuda.grid(1) + + n_features = len(w) + if j >= n_features: + return + + out[j] = w[j] + coef * (w[j] - old_w[j]) From 8f579316e1ba3c740a91252aca42f26109448c3f Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 13 Mar 2023 22:52:52 +0100 Subject: [PATCH 10/52] fix numba solver --- skglm/gpu/numba_solver.py | 43 +++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/skglm/gpu/numba_solver.py b/skglm/gpu/numba_solver.py index 0c2ba0432..bc01996ac 100644 --- a/skglm/gpu/numba_solver.py +++ b/skglm/gpu/numba_solver.py @@ -31,8 +31,8 @@ def solve(self, X, y, lmbd): n_blocks_axis_0, n_blocks_axis_1 = [math.ceil(n / N_THREADS) for n in X.shape] # transfer to device - X_gpu = cuda.as_cuda_array(X) - y_gpu = cuda.as_cuda_array(y) + X_gpu = cuda.to_device(X) + y_gpu = cuda.to_device(y) # init vars on device w = cuda.device_array(n_features) @@ -46,18 +46,23 @@ def solve(self, X, y, lmbd): for it in range(self.max_iter): - __compute_minus_residual[n_blocks_axis_0, N_THREADS]( - X_gpu, y_gpu, w, out=minus_residual) + # inplace update of minus residual + _compute_minus_residual[n_blocks_axis_0, N_THREADS]( + X_gpu, y_gpu, w, minus_residual) - __compute_grad[n_blocks_axis_1, N_THREADS]( - X_gpu, minus_residual, out=grad) + # inplace update of grad + _compute_grad[n_blocks_axis_1, N_THREADS]( + X_gpu, minus_residual, grad) - __forward_backward[n_blocks_axis_1, N_THREADS]( - mid_w, grad, step, lmbd, out=w) + # inplace update of w + _forward_backward[n_blocks_axis_1, N_THREADS]( + mid_w, grad, step, lmbd, w) if self.verbose: - p_obj = compute_obj(X, y, lmbd, w) - opt_crit = eval_opt_crit(X, y, lmbd, w) + w_cpu = w.copy_to_host() + + p_obj = compute_obj(X, y, lmbd, w_cpu) + opt_crit = eval_opt_crit(X, y, lmbd, w_cpu) print( f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" @@ -65,7 +70,8 @@ def solve(self, X, y, lmbd): # extrapolate coef = (t_old - 1) / t_new - __extrapolate[n_blocks_axis_1, N_THREADS](w, old_w, coef, out=mid_w) + _extrapolate[n_blocks_axis_1, N_THREADS]( + w, old_w, coef, mid_w) # update FISTA vars t_old = t_new @@ -73,11 +79,14 @@ def solve(self, X, y, lmbd): # in `copy_to_device`: `self` is destination and `other` is source old_w.copy_to_device(w) - return w + # transfer back to host + w_cpu = w.copy_to_host() + + return w_cpu @cuda.jit -def __compute_minus_residual(X_gpu, y_gpu, w, out): +def _compute_minus_residual(X_gpu, y_gpu, w, out): # compute: out = X_gpu @ w - y_gpu i = cuda.grid(1) @@ -88,13 +97,13 @@ def __compute_minus_residual(X_gpu, y_gpu, w, out): tmp = 0. for j in range(n_features): tmp += X_gpu[i, j] * w[j] - tmp += y_gpu[i] + tmp -= y_gpu[i] out[i] = tmp @cuda.jit -def __compute_grad(X_gpu, minus_residual, out): +def _compute_grad(X_gpu, minus_residual, out): # compute: out=X.T @ minus_residual j = cuda.grid(1) @@ -110,7 +119,7 @@ def __compute_grad(X_gpu, minus_residual, out): @cuda.jit -def __forward_backward(mid_w, grad, step, lmbd, out): +def _forward_backward(mid_w, grad, step, lmbd, out): # forward: mid_w = mid_w - step * grad # backward: w = ST_vec(mid_w, step * lmbd) j = cuda.grid(1) @@ -135,7 +144,7 @@ def __forward_backward(mid_w, grad, step, lmbd, out): @cuda.jit -def __extrapolate(w, old_w, coef, out): +def _extrapolate(w, old_w, coef, out): # compute: out = w + coef * (w - old_w) j = cuda.grid(1) From 68c0b716076d88cfc6c0acfc943c1f6f1e17c9c8 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 13 Mar 2023 22:53:02 +0100 Subject: [PATCH 11/52] unittest numba solver --- skglm/gpu/example.py | 11 +++-------- skglm/gpu/tests/test_solver.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/skglm/gpu/example.py b/skglm/gpu/example.py index 6e2ba08e2..2388ff34e 100644 --- a/skglm/gpu/example.py +++ b/skglm/gpu/example.py @@ -4,13 +4,13 @@ from numpy.linalg import norm from sklearn.linear_model import Lasso -from skglm.gpu.jax_solver import JaxSolver +from skglm.gpu.numba_solver import NumbaSolver from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit random_state = 1265 -n_samples, n_features = 100, 30 +n_samples, n_features = 10 * 1020, 100 reg = 1e-2 # generate dummy data @@ -23,15 +23,10 @@ lmbd_max = norm(X.T @ y, ord=np.inf) lmbd = reg * lmbd_max -solver = JaxSolver(verbose=1, use_auto_diff=False) - -# cache grad -solver.max_iter = 2 -solver.solve(X, y, lmbd) +solver = NumbaSolver(verbose=0) # solve problem start = time.perf_counter() -solver.max_iter = 1000 w_gpu = solver.solve(X, y, lmbd) end = time.perf_counter() diff --git a/skglm/gpu/tests/test_solver.py b/skglm/gpu/tests/test_solver.py index 104a36df7..444304b36 100644 --- a/skglm/gpu/tests/test_solver.py +++ b/skglm/gpu/tests/test_solver.py @@ -6,6 +6,7 @@ from skglm.gpu.cpu_solver import CPUSolver from skglm.gpu.cupy_solver import CupySolver from skglm.gpu.jax_solver import JaxSolver +from skglm.gpu.numba_solver import NumbaSolver from skglm.gpu.utils.host_utils import eval_opt_crit @@ -13,7 +14,8 @@ @pytest.mark.parametrize("solver", [CupySolver(), CPUSolver(), JaxSolver(use_auto_diff=False), - JaxSolver(use_auto_diff=True)]) + JaxSolver(use_auto_diff=True), + NumbaSolver(max_iter=5_000)]) def test_solves(solver): random_state = 1265 n_samples, n_features = 100, 30 @@ -31,4 +33,8 @@ def test_solves(solver): w = solver.solve(X, y, lmbd) stop_crit = eval_opt_crit(X, y, lmbd, w) - np.testing.assert_allclose(stop_crit, 0., atol=1e-9) + + # Numba GPU is not that precise + # needs investigation: (n threads and blocks), dtype? + atol = 1e-4 if isinstance(solver, NumbaSolver) else 1e-9 + np.testing.assert_allclose(stop_crit, 0., atol=atol) From 0677519c40ffe0fb04d5ced873ea101815a91a2a Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Tue, 14 Mar 2023 10:49:50 +0100 Subject: [PATCH 12/52] numba solver example --- skglm/gpu/example.py | 17 +++++++++-------- skglm/gpu/numba_solver.py | 4 ++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/skglm/gpu/example.py b/skglm/gpu/example.py index 2388ff34e..fe8499eb4 100644 --- a/skglm/gpu/example.py +++ b/skglm/gpu/example.py @@ -3,14 +3,14 @@ import numpy as np from numpy.linalg import norm -from sklearn.linear_model import Lasso from skglm.gpu.numba_solver import NumbaSolver +from skglm.gpu.cpu_solver import CPUSolver from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit random_state = 1265 -n_samples, n_features = 10 * 1020, 100 +n_samples, n_features = 10_000, 500 reg = 1e-2 # generate dummy data @@ -24,33 +24,34 @@ lmbd = reg * lmbd_max solver = NumbaSolver(verbose=0) +solver.max_iter = 10 +solver.solve(X, y, lmbd) # solve problem start = time.perf_counter() +solver.max_iter = 1000 w_gpu = solver.solve(X, y, lmbd) end = time.perf_counter() print("gpu time: ", end - start) +solver_cpu = CPUSolver() start = time.perf_counter() -estimator = Lasso(alpha=lmbd / n_samples, fit_intercept=False) -estimator.fit(X, y) +w_cpu = solver_cpu.solve(X, y, lmbd) end = time.perf_counter() print("sklearn time: ", end - start) -w_sk = estimator.coef_ - print( "Objective\n" f"gpu : {compute_obj(X, y, lmbd, w_gpu):.8f}\n" - f"sklearn: {compute_obj(X, y, lmbd, w_sk):.8f}" + f"cpu : {compute_obj(X, y, lmbd, w_cpu):.8f}" ) print( "Optimality condition\n" f"gpu : {eval_opt_crit(X, y, lmbd, w_gpu):.8f}\n" - f"sklearn: {eval_opt_crit(X, y, lmbd, w_sk):.8f}" + f"cpu : {eval_opt_crit(X, y, lmbd, w_cpu):.8f}" ) diff --git a/skglm/gpu/numba_solver.py b/skglm/gpu/numba_solver.py index bc01996ac..5bc9695c7 100644 --- a/skglm/gpu/numba_solver.py +++ b/skglm/gpu/numba_solver.py @@ -28,7 +28,7 @@ def solve(self, X, y, lmbd): step = 1 / lipschitz # number of block to use along each axis when calling kernel - n_blocks_axis_0, n_blocks_axis_1 = [math.ceil(n / N_THREADS) for n in X.shape] + n_blocks_axis_0, n_blocks_axis_1 = (math.ceil(n / N_THREADS) for n in X.shape) # transfer to device X_gpu = cuda.to_device(X) @@ -104,7 +104,7 @@ def _compute_minus_residual(X_gpu, y_gpu, w, out): @cuda.jit def _compute_grad(X_gpu, minus_residual, out): - # compute: out=X.T @ minus_residual + # compute: out = X.T @ minus_residual j = cuda.grid(1) n_samples, n_features = X_gpu.shape From cd686b3546cd598433676e22f8028b57f57cce54 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Tue, 14 Mar 2023 11:10:57 +0100 Subject: [PATCH 13/52] move into solvers folder --- skglm/gpu/example.py | 3 +-- skglm/gpu/solvers/__init__.py | 4 ++++ skglm/gpu/{ => solvers}/cpu_solver.py | 0 skglm/gpu/{ => solvers}/cupy_solver.py | 0 skglm/gpu/{ => solvers}/jax_solver.py | 0 skglm/gpu/{ => solvers}/numba_solver.py | 0 skglm/gpu/tests/test_solver.py | 5 +---- 7 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 skglm/gpu/solvers/__init__.py rename skglm/gpu/{ => solvers}/cpu_solver.py (100%) rename skglm/gpu/{ => solvers}/cupy_solver.py (100%) rename skglm/gpu/{ => solvers}/jax_solver.py (100%) rename skglm/gpu/{ => solvers}/numba_solver.py (100%) diff --git a/skglm/gpu/example.py b/skglm/gpu/example.py index fe8499eb4..8355489d3 100644 --- a/skglm/gpu/example.py +++ b/skglm/gpu/example.py @@ -3,8 +3,7 @@ import numpy as np from numpy.linalg import norm -from skglm.gpu.numba_solver import NumbaSolver -from skglm.gpu.cpu_solver import CPUSolver +from skglm.gpu.solvers import NumbaSolver, CPUSolver from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit diff --git a/skglm/gpu/solvers/__init__.py b/skglm/gpu/solvers/__init__.py new file mode 100644 index 000000000..e1cc3574f --- /dev/null +++ b/skglm/gpu/solvers/__init__.py @@ -0,0 +1,4 @@ +from skglm.gpu.solvers.cpu_solver import CPUSolver # noqa +from skglm.gpu.solvers.cupy_solver import CupySolver # noqa +from skglm.gpu.solvers.jax_solver import JaxSolver # noqa +from skglm.gpu.solvers.numba_solver import NumbaSolver # noqa diff --git a/skglm/gpu/cpu_solver.py b/skglm/gpu/solvers/cpu_solver.py similarity index 100% rename from skglm/gpu/cpu_solver.py rename to skglm/gpu/solvers/cpu_solver.py diff --git a/skglm/gpu/cupy_solver.py b/skglm/gpu/solvers/cupy_solver.py similarity index 100% rename from skglm/gpu/cupy_solver.py rename to skglm/gpu/solvers/cupy_solver.py diff --git a/skglm/gpu/jax_solver.py b/skglm/gpu/solvers/jax_solver.py similarity index 100% rename from skglm/gpu/jax_solver.py rename to skglm/gpu/solvers/jax_solver.py diff --git a/skglm/gpu/numba_solver.py b/skglm/gpu/solvers/numba_solver.py similarity index 100% rename from skglm/gpu/numba_solver.py rename to skglm/gpu/solvers/numba_solver.py diff --git a/skglm/gpu/tests/test_solver.py b/skglm/gpu/tests/test_solver.py index 444304b36..9832a4fd3 100644 --- a/skglm/gpu/tests/test_solver.py +++ b/skglm/gpu/tests/test_solver.py @@ -3,10 +3,7 @@ import numpy as np from numpy.linalg import norm -from skglm.gpu.cpu_solver import CPUSolver -from skglm.gpu.cupy_solver import CupySolver -from skglm.gpu.jax_solver import JaxSolver -from skglm.gpu.numba_solver import NumbaSolver +from skglm.gpu.solvers import CPUSolver, CupySolver, JaxSolver, NumbaSolver from skglm.gpu.utils.host_utils import eval_opt_crit From dd7298ae28270421334c4bb33066a053b85b63fe Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Tue, 14 Mar 2023 11:26:29 +0100 Subject: [PATCH 14/52] add README to install --- skglm/gpu/README.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 skglm/gpu/README.md diff --git a/skglm/gpu/README.md b/skglm/gpu/README.md new file mode 100644 index 000000000..070150ca0 --- /dev/null +++ b/skglm/gpu/README.md @@ -0,0 +1,34 @@ +## Installation + +1. checkout branch +```shell +# add remote if it does't exist (check with: git remote -v) +git remote add Badr-MOUFAD https://github.com/Badr-MOUFAD/skglm.git + +git fetch Badr-MOUFAD skglm-gpu + +git checkout skglm-gpu +``` + +2. create then activate``conda`` environnement +```shell +# create +conda create -n another-skglm-gpu python=3.7 + +# activate env +conda activate another-skglm-gpu +``` + +3. install ``skglm`` in editable mode +```shell +pip install skglm -e . +``` + +4. install dependencies +```shell +# cupy +conda conda install -c conda-forge cupy cudatoolkit=11.5 + +# jax +conda install jaxlib=*=*cuda* jax cuda-nvcc -c conda-forge -c nvidia +``` From 3b950863edaf9179bc4e03463d8539f303e018e4 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Tue, 14 Mar 2023 11:33:21 +0100 Subject: [PATCH 15/52] fix conda env name --- skglm/gpu/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skglm/gpu/README.md b/skglm/gpu/README.md index 070150ca0..2dcf69256 100644 --- a/skglm/gpu/README.md +++ b/skglm/gpu/README.md @@ -13,10 +13,10 @@ git checkout skglm-gpu 2. create then activate``conda`` environnement ```shell # create -conda create -n another-skglm-gpu python=3.7 +conda create -n skglm-gpu python=3.7 # activate env -conda activate another-skglm-gpu +conda activate skglm-gpu ``` 3. install ``skglm`` in editable mode From 204c5e7ac104126490f21e0ad57a7e8a105e7dff Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Wed, 15 Mar 2023 15:50:59 +0100 Subject: [PATCH 16/52] fix bug numba solver --- skglm/gpu/solvers/numba_solver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index 5bc9695c7..e42a19a99 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -48,7 +48,7 @@ def solve(self, X, y, lmbd): # inplace update of minus residual _compute_minus_residual[n_blocks_axis_0, N_THREADS]( - X_gpu, y_gpu, w, minus_residual) + X_gpu, y_gpu, mid_w, minus_residual) # inplace update of grad _compute_grad[n_blocks_axis_1, N_THREADS]( From b7ce4fe94180408515c28f96a93990509d268dca Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Wed, 15 Mar 2023 15:51:32 +0100 Subject: [PATCH 17/52] update unittest & example --- skglm/gpu/example.py | 14 ++++++++------ skglm/gpu/tests/test_solver.py | 7 ++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/skglm/gpu/example.py b/skglm/gpu/example.py index 8355489d3..a41ff4676 100644 --- a/skglm/gpu/example.py +++ b/skglm/gpu/example.py @@ -22,24 +22,26 @@ lmbd_max = norm(X.T @ y, ord=np.inf) lmbd = reg * lmbd_max -solver = NumbaSolver(verbose=0) -solver.max_iter = 10 -solver.solve(X, y, lmbd) +# cache numba compilation +NumbaSolver(verbose=0, max_iter=2).solve(X, y, lmbd) +solver_gpu = NumbaSolver() # solve problem start = time.perf_counter() -solver.max_iter = 1000 -w_gpu = solver.solve(X, y, lmbd) +w_gpu = solver_gpu.solve(X, y, lmbd) end = time.perf_counter() print("gpu time: ", end - start) +# cache numba compilation +CPUSolver(max_iter=2).solve(X, y, lmbd) + solver_cpu = CPUSolver() start = time.perf_counter() w_cpu = solver_cpu.solve(X, y, lmbd) end = time.perf_counter() -print("sklearn time: ", end - start) +print("cpu time: ", end - start) print( diff --git a/skglm/gpu/tests/test_solver.py b/skglm/gpu/tests/test_solver.py index 9832a4fd3..6cb3700d3 100644 --- a/skglm/gpu/tests/test_solver.py +++ b/skglm/gpu/tests/test_solver.py @@ -12,7 +12,7 @@ CPUSolver(), JaxSolver(use_auto_diff=False), JaxSolver(use_auto_diff=True), - NumbaSolver(max_iter=5_000)]) + NumbaSolver()]) def test_solves(solver): random_state = 1265 n_samples, n_features = 100, 30 @@ -31,7 +31,4 @@ def test_solves(solver): stop_crit = eval_opt_crit(X, y, lmbd, w) - # Numba GPU is not that precise - # needs investigation: (n threads and blocks), dtype? - atol = 1e-4 if isinstance(solver, NumbaSolver) else 1e-9 - np.testing.assert_allclose(stop_crit, 0., atol=atol) + np.testing.assert_allclose(stop_crit, 0., atol=1e-9) From 71819a051672520faab716ffb1dec187e7225e3a Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Wed, 15 Mar 2023 17:34:36 +0100 Subject: [PATCH 18/52] fix bug init numba --- skglm/gpu/example.py | 18 +++++++++++++----- skglm/gpu/solvers/numba_solver.py | 22 +++++++++++++++------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/skglm/gpu/example.py b/skglm/gpu/example.py index a41ff4676..3ab55a215 100644 --- a/skglm/gpu/example.py +++ b/skglm/gpu/example.py @@ -1,31 +1,39 @@ import time +import warnings import numpy as np from numpy.linalg import norm +from benchopt.datasets import make_correlated_data + from skglm.gpu.solvers import NumbaSolver, CPUSolver from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit +from numba.core.errors import NumbaPerformanceWarning +warnings.filterwarnings('ignore', category=NumbaPerformanceWarning) + -random_state = 1265 +random_state = 27 n_samples, n_features = 10_000, 500 reg = 1e-2 # generate dummy data rng = np.random.RandomState(random_state) -X = rng.randn(n_samples, n_features) -y = rng.randn(n_samples) + +X, y, _ = make_correlated_data(n_samples, n_features, random_state=rng) # set lambda lmbd_max = norm(X.T @ y, ord=np.inf) lmbd = reg * lmbd_max +max_iter = 0 + # cache numba compilation NumbaSolver(verbose=0, max_iter=2).solve(X, y, lmbd) -solver_gpu = NumbaSolver() +solver_gpu = NumbaSolver(verbose=1, max_iter=max_iter) # solve problem start = time.perf_counter() w_gpu = solver_gpu.solve(X, y, lmbd) @@ -37,7 +45,7 @@ # cache numba compilation CPUSolver(max_iter=2).solve(X, y, lmbd) -solver_cpu = CPUSolver() +solver_cpu = CPUSolver(verbose=1, max_iter=max_iter) start = time.perf_counter() w_cpu = solver_cpu.solve(X, y, lmbd) end = time.perf_counter() diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index e42a19a99..935f6dfa6 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -35,10 +35,14 @@ def solve(self, X, y, lmbd): y_gpu = cuda.to_device(y) # init vars on device - w = cuda.device_array(n_features) - old_w = cuda.device_array(n_features) - mid_w = cuda.device_array(n_features) - + # CAUTION: should be init with specific values + # otherwise, stale values in GPU memory are used + w = cuda.to_device(np.zeros(n_features)) + mid_w = cuda.to_device(np.zeros(n_features)) + old_w = cuda.to_device(np.zeros(n_features)) + + # needn't to be init with values as theses are + # they store results of computation grad = cuda.device_array(n_features) minus_residual = cuda.device_array(n_samples) @@ -46,15 +50,18 @@ def solve(self, X, y, lmbd): for it in range(self.max_iter): - # inplace update of minus residual + # inplace update of minus_residual + # minus_residual = X_gpu @ mid_w - y_gpu _compute_minus_residual[n_blocks_axis_0, N_THREADS]( X_gpu, y_gpu, mid_w, minus_residual) # inplace update of grad + # grad = X_gpu @ minus_residual _compute_grad[n_blocks_axis_1, N_THREADS]( X_gpu, minus_residual, grad) # inplace update of w + # w = ST(mid_w - step * grad, step * lmbd) _forward_backward[n_blocks_axis_1, N_THREADS]( mid_w, grad, step, lmbd, w) @@ -70,6 +77,7 @@ def solve(self, X, y, lmbd): # extrapolate coef = (t_old - 1) / t_new + # mid_w = w + coef * (w - old_w) _extrapolate[n_blocks_axis_1, N_THREADS]( w, old_w, coef, mid_w) @@ -86,7 +94,7 @@ def solve(self, X, y, lmbd): @cuda.jit -def _compute_minus_residual(X_gpu, y_gpu, w, out): +def _compute_minus_residual(X_gpu, y_gpu, mid_w, out): # compute: out = X_gpu @ w - y_gpu i = cuda.grid(1) @@ -96,7 +104,7 @@ def _compute_minus_residual(X_gpu, y_gpu, w, out): tmp = 0. for j in range(n_features): - tmp += X_gpu[i, j] * w[j] + tmp += X_gpu[i, j] * mid_w[j] tmp -= y_gpu[i] out[i] = tmp From b6372f9bb06a8a4cc5886ffdbdd05fae9d94093f Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 17 Mar 2023 11:44:50 +0100 Subject: [PATCH 19/52] base class for Fista solvers --- skglm/gpu/solvers/base.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 skglm/gpu/solvers/base.py diff --git a/skglm/gpu/solvers/base.py b/skglm/gpu/solvers/base.py new file mode 100644 index 000000000..3c077b60a --- /dev/null +++ b/skglm/gpu/solvers/base.py @@ -0,0 +1,19 @@ +from abc import abstractmethod + +import numpy as np +from scipy import sparse +from scipy.sparse import linalg as spicy_linalg + + +class BaseFistaSolver: + + @abstractmethod + def solve(self, X, y, lmbd): + ... + + @staticmethod + def get_lipschitz_cst(X): + if sparse.issparse(X): + return spicy_linalg.svds(X, k=1)[1][0] ** 2 + + return np.linalg.norm(X, ord=2) ** 2 From 97ceb4d26988bc09dfc6a5aa00835b5d302e2b1b Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 17 Mar 2023 11:45:14 +0100 Subject: [PATCH 20/52] sparse matrix support CPU & CuPy --- skglm/gpu/solvers/cpu_solver.py | 6 ++++-- skglm/gpu/solvers/cupy_solver.py | 14 ++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/skglm/gpu/solvers/cpu_solver.py b/skglm/gpu/solvers/cpu_solver.py index 2c8f6a92f..9501781d5 100644 --- a/skglm/gpu/solvers/cpu_solver.py +++ b/skglm/gpu/solvers/cpu_solver.py @@ -1,10 +1,12 @@ import numpy as np +from skglm.gpu.solvers.base import BaseFistaSolver + from skglm.utils.prox_funcs import ST_vec from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit -class CPUSolver: +class CPUSolver(BaseFistaSolver): def __init__(self, max_iter=1000, verbose=0): self.max_iter = max_iter @@ -14,7 +16,7 @@ def solve(self, X, y, lmbd): n_samples, n_features = X.shape # compute step - lipschitz = np.linalg.norm(X, ord=2) ** 2 + lipschitz = CPUSolver.get_lipschitz_cst(X) if lipschitz == 0.: return np.zeros(n_features) diff --git a/skglm/gpu/solvers/cupy_solver.py b/skglm/gpu/solvers/cupy_solver.py index 153ca3c1c..67a9907de 100644 --- a/skglm/gpu/solvers/cupy_solver.py +++ b/skglm/gpu/solvers/cupy_solver.py @@ -1,10 +1,14 @@ import cupy as cp +import cupyx.scipy.sparse as cpx + import numpy as np +from scipy import sparse +from skglm.gpu.solvers.base import BaseFistaSolver from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit -class CupySolver: +class CupySolver(BaseFistaSolver): def __init__(self, max_iter=1000, verbose=0): self.max_iter = max_iter @@ -14,14 +18,16 @@ def solve(self, X, y, lmbd): n_samples, n_features = X.shape # compute step - lipschitz = np.linalg.norm(X, ord=2) ** 2 + lipschitz = CupySolver.get_lipschitz_cst(X) if lipschitz == 0.: return np.zeros(n_features) step = 1 / lipschitz + is_X_sparse = sparse.issparse(X) + # transfer to device - X_gpu = cp.array(X) + X_gpu = cp.array(X) if not is_X_sparse else cpx.csr_matrix(X) y_gpu = cp.array(y) # init vars in device @@ -35,7 +41,7 @@ def solve(self, X, y, lmbd): for it in range(self.max_iter): # compute grad - cp.dot(X_gpu.T, X_gpu @ mid_w - y_gpu, out=grad) + grad = X_gpu.T @ (X_gpu @ mid_w - y_gpu) # forward / backward: w = ST(mid_w - step * grad, step * lmbd) mid_w = mid_w - step * grad From 2ad2f2ca6254b0983a20a694c3cc17d2f9b6cbe3 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 17 Mar 2023 11:45:24 +0100 Subject: [PATCH 21/52] unittest sparse data --- skglm/gpu/tests/test_solver.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/skglm/gpu/tests/test_solver.py b/skglm/gpu/tests/test_solver.py index 6cb3700d3..924a7064a 100644 --- a/skglm/gpu/tests/test_solver.py +++ b/skglm/gpu/tests/test_solver.py @@ -2,6 +2,7 @@ import numpy as np from numpy.linalg import norm +from scipy import sparse from skglm.gpu.solvers import CPUSolver, CupySolver, JaxSolver, NumbaSolver @@ -32,3 +33,25 @@ def test_solves(solver): stop_crit = eval_opt_crit(X, y, lmbd, w) np.testing.assert_allclose(stop_crit, 0., atol=1e-9) + + +@pytest.mark.parametrize("solver", [CPUSolver(), CupySolver()]) +def test_solves_sparse(solver): + random_state = 1265 + n_samples, n_features = 100, 30 + reg = 1e-2 + + # generate dummy data + rng = np.random.RandomState(random_state) + X = sparse.rand(n_samples, n_features, density=0.1, + format="csr", random_state=rng) + y = rng.randn(n_samples) + + # set lambda + lmbd_max = norm(X.T @ y, ord=np.inf) + lmbd = reg * lmbd_max + + w = solver.solve(X, y, lmbd) + + stop_crit = eval_opt_crit(X, y, lmbd, w) + np.testing.assert_allclose(stop_crit, 0., atol=1e-7) From b8c753a01463a511dbeb52fc3920a41797b1d71c Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 10 Apr 2023 18:29:30 +0200 Subject: [PATCH 22/52] base quadratic and L1 --- skglm/gpu/solvers/base.py | 54 +++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/skglm/gpu/solvers/base.py b/skglm/gpu/solvers/base.py index 3c077b60a..a249d04b6 100644 --- a/skglm/gpu/solvers/base.py +++ b/skglm/gpu/solvers/base.py @@ -1,19 +1,63 @@ +from numba import njit from abc import abstractmethod import numpy as np from scipy import sparse from scipy.sparse import linalg as spicy_linalg +from skglm.utils.prox_funcs import ST_vec + class BaseFistaSolver: @abstractmethod - def solve(self, X, y, lmbd): + def solve(self, X, y, datafit, penalty): ... - @staticmethod - def get_lipschitz_cst(X): + +class BaseQuadratic: + + def value(self, X, y, w): + """parameters are numpy/scipy arrays.""" + return ((y - X @ w) ** 2).sum() / len(y) + + def gradient(self, X, y, w, Xw): + return X.T @ (Xw - y) / len(y) + + def get_lipschitz_cst(self, X): + n_samples = len(X) + if sparse.issparse(X): - return spicy_linalg.svds(X, k=1)[1][0] ** 2 + return spicy_linalg.svds(X, k=1)[1][0] ** 2 / n_samples + + return np.linalg.norm(X, ord=2) ** 2 / n_samples + + +class BaseL1: + + def __init__(self, alpha): + self.alpha = alpha + + def prox(self, value, stepsize): + return ST_vec(value, self.alpha * stepsize) + + def max_subdiff_distance(self, w, grad): + return BaseL1._compute_dist_subdiff(w, grad, self.alpha) + + @staticmethod + @njit("f8(f8[:], f8[:], f8)") + def _compute_max_subdiff_distance(w, grad, lmbd): + max_dist = 0. + + for i in range(len(w)): + grad_i = grad[i] + w_i = w[i] + + if w[i] == 0.: + dist = max(abs(grad_i) - lmbd, 0.) + else: + dist = abs(grad_i + np.sign(w_i) * lmbd) + + max_dist = max(max_dist, dist) - return np.linalg.norm(X, ord=2) ** 2 + return max_dist From 872f9b3c65b6903b53cf5d4c2a61346231020026 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 10 Apr 2023 18:29:46 +0200 Subject: [PATCH 23/52] refactor CPU solver --- skglm/gpu/solvers/cpu_solver.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/skglm/gpu/solvers/cpu_solver.py b/skglm/gpu/solvers/cpu_solver.py index 9501781d5..89ee6fce5 100644 --- a/skglm/gpu/solvers/cpu_solver.py +++ b/skglm/gpu/solvers/cpu_solver.py @@ -1,10 +1,6 @@ import numpy as np - from skglm.gpu.solvers.base import BaseFistaSolver -from skglm.utils.prox_funcs import ST_vec -from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit - class CPUSolver(BaseFistaSolver): @@ -12,11 +8,11 @@ def __init__(self, max_iter=1000, verbose=0): self.max_iter = max_iter self.verbose = verbose - def solve(self, X, y, lmbd): + def solve(self, X, y, datafit, penalty): n_samples, n_features = X.shape # compute step - lipschitz = CPUSolver.get_lipschitz_cst(X) + lipschitz = datafit.get_lipschitz_cst(X) if lipschitz == 0.: return np.zeros(n_features) @@ -33,15 +29,16 @@ def solve(self, X, y, lmbd): for it in range(self.max_iter): # compute grad - grad = X.T @ (X @ mid_w - y) + grad = datafit.gradient(X, y, mid_w, X @ mid_w) # forward / backward - mid_w = mid_w - step * grad - w = ST_vec(mid_w, step * lmbd) + w = penalty.prox(mid_w - step * grad, step) if self.verbose: - p_obj = compute_obj(X, y, lmbd, w) - opt_crit = eval_opt_crit(X, y, lmbd, w) + p_obj = datafit.value(X, y, w, X @ w) + penalty.value(w) + + grad = datafit.gradient(X, y, mid_w, X @ mid_w) + opt_crit = penalty.max_subdiff_distance(self, w, grad) print( f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" From 0aa15b3ecaa97ada1dd330d3494226206b3d502a Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 10 Apr 2023 18:47:52 +0200 Subject: [PATCH 24/52] test utils and fixes --- skglm/gpu/solvers/base.py | 9 ++++++--- skglm/gpu/tests/test_utils.py | 4 ++-- skglm/gpu/utils/host_utils.py | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/skglm/gpu/solvers/base.py b/skglm/gpu/solvers/base.py index a249d04b6..018890301 100644 --- a/skglm/gpu/solvers/base.py +++ b/skglm/gpu/solvers/base.py @@ -17,9 +17,9 @@ def solve(self, X, y, datafit, penalty): class BaseQuadratic: - def value(self, X, y, w): + def value(self, X, y, w, Xw): """parameters are numpy/scipy arrays.""" - return ((y - X @ w) ** 2).sum() / len(y) + return ((y - X @ w) ** 2).sum() / (2 * len(y)) def gradient(self, X, y, w, Xw): return X.T @ (Xw - y) / len(y) @@ -38,11 +38,14 @@ class BaseL1: def __init__(self, alpha): self.alpha = alpha + def value(self, w): + return self.alpha * np.abs(w).sum() + def prox(self, value, stepsize): return ST_vec(value, self.alpha * stepsize) def max_subdiff_distance(self, w, grad): - return BaseL1._compute_dist_subdiff(w, grad, self.alpha) + return BaseL1._compute_max_subdiff_distance(w, grad, self.alpha) @staticmethod @njit("f8(f8[:], f8[:], f8)") diff --git a/skglm/gpu/tests/test_utils.py b/skglm/gpu/tests/test_utils.py index 3df4b5db7..43daee2a2 100644 --- a/skglm/gpu/tests/test_utils.py +++ b/skglm/gpu/tests/test_utils.py @@ -14,7 +14,7 @@ def test_compute_obj(): p_obj = compute_obj(X, y, lmbd, w) - np.testing.assert_array_equal(p_obj, 0.5 * 20 + 10. * 6) + np.testing.assert_array_equal(p_obj, 20 / (2 * 3) + 10. * 6) def test_eval_optimality(): @@ -26,7 +26,7 @@ def test_eval_optimality(): lmbd = 1. estimator = Lasso( - alpha=lmbd / n_samples, fit_intercept=False, tol=1e-9 + alpha=lmbd, fit_intercept=False, tol=1e-9 ).fit(X, y) np.testing.assert_allclose( diff --git a/skglm/gpu/utils/host_utils.py b/skglm/gpu/utils/host_utils.py index 659a8de39..62e7d0b11 100644 --- a/skglm/gpu/utils/host_utils.py +++ b/skglm/gpu/utils/host_utils.py @@ -5,11 +5,11 @@ def compute_obj(X, y, lmbd, w): - return 0.5 * norm(y - X @ w) ** 2 + lmbd * norm(w, ord=1) + return norm(y - X @ w) ** 2 / (2 * len(y)) + lmbd * norm(w, ord=1) def eval_opt_crit(X, y, lmbd, w): - grad = X.T @ (X @ w - y) + grad = X.T @ (X @ w - y) / len(y) opt_crit = _compute_dist_subdiff(w, grad, lmbd) return opt_crit From 14efe7bacacf22034c2660c5cd3fa4a76542f86e Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 10 Apr 2023 18:48:10 +0200 Subject: [PATCH 25/52] unittest FISTA CPU --- skglm/gpu/solvers/cpu_solver.py | 2 +- skglm/gpu/tests/test_fista.py | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 skglm/gpu/tests/test_fista.py diff --git a/skglm/gpu/solvers/cpu_solver.py b/skglm/gpu/solvers/cpu_solver.py index 89ee6fce5..f27f89e5e 100644 --- a/skglm/gpu/solvers/cpu_solver.py +++ b/skglm/gpu/solvers/cpu_solver.py @@ -38,7 +38,7 @@ def solve(self, X, y, datafit, penalty): p_obj = datafit.value(X, y, w, X @ w) + penalty.value(w) grad = datafit.gradient(X, y, mid_w, X @ mid_w) - opt_crit = penalty.max_subdiff_distance(self, w, grad) + opt_crit = penalty.max_subdiff_distance(w, grad) print( f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py new file mode 100644 index 000000000..f20ef7371 --- /dev/null +++ b/skglm/gpu/tests/test_fista.py @@ -0,0 +1,36 @@ +import pytest + +import numpy as np +from numpy.linalg import norm + +from skglm.gpu.solvers import CPUSolver +from skglm.gpu.solvers.base import BaseQuadratic, BaseL1 + +from skglm.gpu.utils.host_utils import eval_opt_crit + + +@pytest.mark.parametrize("solver, datafit_cls, penalty_cls", + [CPUSolver(), BaseQuadratic, BaseL1]) +def test_solves(solver, datafit_cls, penalty_cls): + random_state = 1265 + n_samples, n_features = 100, 30 + reg = 1e-2 + + # generate dummy data + rng = np.random.RandomState(random_state) + X = rng.randn(n_samples, n_features) + y = rng.randn(n_samples) + + # set lambda + lmbd_max = norm(X.T @ y, ord=np.inf) / n_samples + lmbd = reg * lmbd_max + + w = solver.solve(X, y, datafit_cls(), penalty_cls(lmbd)) + + stop_crit = eval_opt_crit(X, y, lmbd, w) + + np.testing.assert_allclose(stop_crit, 0., atol=1e-9) + + +if __name__ == "__main__": + pass From 05a4f3628753816f619df2e4769522a035afb544 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 10 Apr 2023 18:56:12 +0200 Subject: [PATCH 26/52] sparse data unittest --- skglm/gpu/solvers/base.py | 2 +- skglm/gpu/tests/test_fista.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/skglm/gpu/solvers/base.py b/skglm/gpu/solvers/base.py index 018890301..4082a7f82 100644 --- a/skglm/gpu/solvers/base.py +++ b/skglm/gpu/solvers/base.py @@ -25,7 +25,7 @@ def gradient(self, X, y, w, Xw): return X.T @ (Xw - y) / len(y) def get_lipschitz_cst(self, X): - n_samples = len(X) + n_samples = X.shape[0] if sparse.issparse(X): return spicy_linalg.svds(X, k=1)[1][0] ** 2 / n_samples diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index f20ef7371..c12942e27 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -1,5 +1,7 @@ import pytest +from scipy import sparse + import numpy as np from numpy.linalg import norm @@ -10,15 +12,20 @@ @pytest.mark.parametrize("solver, datafit_cls, penalty_cls", - [CPUSolver(), BaseQuadratic, BaseL1]) -def test_solves(solver, datafit_cls, penalty_cls): + [(CPUSolver(), BaseQuadratic, BaseL1)]) +@pytest.mark.parametrize("sparse_X", [True, False]) +def test_solves(sparse_X, solver, datafit_cls, penalty_cls): random_state = 1265 n_samples, n_features = 100, 30 reg = 1e-2 # generate dummy data rng = np.random.RandomState(random_state) - X = rng.randn(n_samples, n_features) + if sparse_X: + X = sparse.rand(n_samples, n_features, density=0.1, + format="csr", random_state=rng) + else: + X = rng.randn(n_samples, n_features) y = rng.randn(n_samples) # set lambda From d23ac128511fc6bb3ed298337393ef5bb03f1f13 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 10 Apr 2023 19:09:03 +0200 Subject: [PATCH 27/52] modular CuPy solver --- skglm/gpu/solvers/cupy_solver.py | 26 ++++++++++++++++---------- skglm/gpu/tests/test_fista.py | 5 ++++- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/skglm/gpu/solvers/cupy_solver.py b/skglm/gpu/solvers/cupy_solver.py index 67a9907de..7bf8e5578 100644 --- a/skglm/gpu/solvers/cupy_solver.py +++ b/skglm/gpu/solvers/cupy_solver.py @@ -4,8 +4,7 @@ import numpy as np from scipy import sparse -from skglm.gpu.solvers.base import BaseFistaSolver -from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit +from skglm.gpu.solvers.base import BaseFistaSolver, BaseL1 class CupySolver(BaseFistaSolver): @@ -14,11 +13,11 @@ def __init__(self, max_iter=1000, verbose=0): self.max_iter = max_iter self.verbose = verbose - def solve(self, X, y, lmbd): + def solve(self, X, y, datafit, penalty): n_samples, n_features = X.shape # compute step - lipschitz = CupySolver.get_lipschitz_cst(X) + lipschitz = datafit.get_lipschitz_cst(X) if lipschitz == 0.: return np.zeros(n_features) @@ -41,17 +40,18 @@ def solve(self, X, y, lmbd): for it in range(self.max_iter): # compute grad - grad = X_gpu.T @ (X_gpu @ mid_w - y_gpu) + grad = datafit.gradient(X_gpu, y_gpu, mid_w, X_gpu @ mid_w) - # forward / backward: w = ST(mid_w - step * grad, step * lmbd) - mid_w = mid_w - step * grad - w = cp.sign(mid_w) * cp.maximum(cp.abs(mid_w) - step * lmbd, 0.) + # forward / backward + w = penalty.prox(mid_w - step * grad, step) if self.verbose: w_cpu = cp.asnumpy(w) - p_obj = compute_obj(X, y, lmbd, w_cpu) - opt_crit = eval_opt_crit(X, y, lmbd, w_cpu) + p_obj = datafit.value(X, y, w, X @ w) + penalty.value(w) + + grad = datafit.gradient(X, y, mid_w, X @ mid_w) + opt_crit = penalty.max_subdiff_distance(w, grad) print( f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" @@ -69,3 +69,9 @@ def solve(self, X, y, lmbd): w_cpu = cp.asnumpy(w) return w_cpu + + +class L1CuPy(BaseL1): + + def prox(self, value, stepsize): + return cp.sign(value) * cp.maximum(cp.abs(value) - stepsize * self.alpha, 0.) diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index c12942e27..51419dde4 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -8,11 +8,14 @@ from skglm.gpu.solvers import CPUSolver from skglm.gpu.solvers.base import BaseQuadratic, BaseL1 +from skglm.gpu.solvers.cupy_solver import CupySolver, L1CuPy + from skglm.gpu.utils.host_utils import eval_opt_crit @pytest.mark.parametrize("solver, datafit_cls, penalty_cls", - [(CPUSolver(), BaseQuadratic, BaseL1)]) + [(CPUSolver(), BaseQuadratic, BaseL1), + (CupySolver(), BaseQuadratic, L1CuPy)]) @pytest.mark.parametrize("sparse_X", [True, False]) def test_solves(sparse_X, solver, datafit_cls, penalty_cls): random_state = 1265 From 601eb868725147bcc2863353ba086f0a56b5eb77 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 10 Apr 2023 20:16:53 +0200 Subject: [PATCH 28/52] fix cupy verbose --- skglm/gpu/solvers/cupy_solver.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skglm/gpu/solvers/cupy_solver.py b/skglm/gpu/solvers/cupy_solver.py index 7bf8e5578..4bbab4513 100644 --- a/skglm/gpu/solvers/cupy_solver.py +++ b/skglm/gpu/solvers/cupy_solver.py @@ -48,10 +48,10 @@ def solve(self, X, y, datafit, penalty): if self.verbose: w_cpu = cp.asnumpy(w) - p_obj = datafit.value(X, y, w, X @ w) + penalty.value(w) + p_obj = datafit.value(X_gpu, y_gpu, w, X_gpu @ w) + penalty.value(w) - grad = datafit.gradient(X, y, mid_w, X @ mid_w) - opt_crit = penalty.max_subdiff_distance(w, grad) + grad = datafit.gradient(X, y, w_cpu, X @ w_cpu) + opt_crit = penalty.max_subdiff_distance(w_cpu, grad) print( f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" From 761ab5450b094cfce634e8a6851cfad0db495ebd Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 10 Apr 2023 21:14:50 +0200 Subject: [PATCH 29/52] modular jax --- skglm/gpu/solvers/jax_solver.py | 48 ++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/skglm/gpu/solvers/jax_solver.py b/skglm/gpu/solvers/jax_solver.py index 54a21bda0..11a25b22e 100644 --- a/skglm/gpu/solvers/jax_solver.py +++ b/skglm/gpu/solvers/jax_solver.py @@ -12,21 +12,21 @@ # if not, amplifies rounding errors. jax.config.update("jax_enable_x64", True) # noqa -from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit # noqa +from skglm.gpu.solvers.base import BaseFistaSolver, BaseQuadratic, BaseL1 # noqa -class JaxSolver: +class JaxSolver(BaseFistaSolver): def __init__(self, max_iter=1000, use_auto_diff=True, verbose=0): self.max_iter = max_iter self.use_auto_diff = use_auto_diff self.verbose = verbose - def solve(self, X, y, lmbd): + def solve(self, X, y, datafit, penalty): n_samples, n_features = X.shape # compute step - lipschitz = np.linalg.norm(X, ord=2) ** 2 + lipschitz = datafit.get_lipschitz_cst(X) if lipschitz == 0.: return np.zeros(n_features) @@ -38,7 +38,7 @@ def solve(self, X, y, lmbd): # get grad func of datafit if self.use_auto_diff: - grad_quad_loss = jax.grad(_quad_loss) + auto_grad = jax.jit(jax.grad(datafit.value, argnums=-1)) # init vars in device w = jnp.zeros(n_features) @@ -52,19 +52,25 @@ def solve(self, X, y, lmbd): # compute grad if self.use_auto_diff: - grad = grad_quad_loss(mid_w, X_gpu, y_gpu) + grad = auto_grad(X_gpu, y_gpu, mid_w) else: - grad = jnp.dot(X_gpu.T, jnp.dot(X_gpu, mid_w) - y_gpu) + grad = datafit.gradient(X_gpu, y_gpu, mid_w) # forward / backward - mid_w = mid_w - step * grad - w = jnp.sign(mid_w) * jnp.maximum(jnp.abs(mid_w) - step * lmbd, 0.) + val = mid_w - step * grad + w = penalty.prox(val, step) if self.verbose: - w_cpu = np.asarray(w, dtype=np.float64) + p_obj = datafit.value(X_gpu, y_gpu, w) + penalty.value(w) + + if self.use_auto_diff: + grad = auto_grad(X_gpu, y_gpu, w) + else: + grad = datafit.gradient(X_gpu, y_gpu, w) - p_obj = compute_obj(X, y, lmbd, w_cpu) - opt_crit = eval_opt_crit(X, y, lmbd, w_cpu) + w_cpu = np.asarray(w, dtype=np.float64) + grad_cpu = np.asarray(grad, dtype=np.float64) + opt_crit = penalty.max_subdiff_distance(w_cpu, grad_cpu) print( f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" @@ -84,6 +90,18 @@ def solve(self, X, y, lmbd): return w_cpu -def _quad_loss(w, X_gpu, y_gpu): - pred_y = jnp.dot(X_gpu, w) - return 0.5 * jnp.sum((y_gpu - pred_y) ** 2) +class QuadraticJax(BaseQuadratic): + + def value(self, X_gpu, y_gpu, w): + n_samples = X_gpu.shape[0] + return jnp.sum((jnp.dot(X_gpu, w) - y_gpu) ** 2) / (2. * n_samples) + + def gradient(self, X_gpu, y_gpu, w): + n_samples = X_gpu.shape[0] + return jnp.dot(X_gpu.T, jnp.dot(X_gpu, w) - y_gpu) / n_samples + + +class L1Jax(BaseL1): + + def prox(self, value, stepsize): + return jnp.sign(value) * jnp.maximum(jnp.abs(value) - stepsize * self.alpha, 0.) From 6274e5f1aa998b240f8002abaa939a1718b40725 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Mon, 10 Apr 2023 21:15:13 +0200 Subject: [PATCH 30/52] unittest jax --- skglm/gpu/solvers/cupy_solver.py | 6 +++--- skglm/gpu/tests/test_fista.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/skglm/gpu/solvers/cupy_solver.py b/skglm/gpu/solvers/cupy_solver.py index 4bbab4513..43203e086 100644 --- a/skglm/gpu/solvers/cupy_solver.py +++ b/skglm/gpu/solvers/cupy_solver.py @@ -43,13 +43,13 @@ def solve(self, X, y, datafit, penalty): grad = datafit.gradient(X_gpu, y_gpu, mid_w, X_gpu @ mid_w) # forward / backward - w = penalty.prox(mid_w - step * grad, step) + val = mid_w - step * grad + w = penalty.prox(val, step) if self.verbose: - w_cpu = cp.asnumpy(w) - p_obj = datafit.value(X_gpu, y_gpu, w, X_gpu @ w) + penalty.value(w) + w_cpu = cp.asnumpy(w) grad = datafit.gradient(X, y, w_cpu, X @ w_cpu) opt_crit = penalty.max_subdiff_distance(w_cpu, grad) diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index 51419dde4..d50380559 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -9,15 +9,21 @@ from skglm.gpu.solvers.base import BaseQuadratic, BaseL1 from skglm.gpu.solvers.cupy_solver import CupySolver, L1CuPy +from skglm.gpu.solvers.jax_solver import JaxSolver, QuadraticJax, L1Jax from skglm.gpu.utils.host_utils import eval_opt_crit @pytest.mark.parametrize("solver, datafit_cls, penalty_cls", [(CPUSolver(), BaseQuadratic, BaseL1), - (CupySolver(), BaseQuadratic, L1CuPy)]) + (CupySolver(), BaseQuadratic, L1CuPy), + (JaxSolver(use_auto_diff=True), QuadraticJax, L1Jax), + (JaxSolver(use_auto_diff=False), QuadraticJax, L1Jax)]) @pytest.mark.parametrize("sparse_X", [True, False]) def test_solves(sparse_X, solver, datafit_cls, penalty_cls): + if sparse_X and isinstance(solver, JaxSolver): + pytest.xfail(reason="Sparse X is still not yet supported") + random_state = 1265 n_samples, n_features = 100, 30 reg = 1e-2 @@ -26,7 +32,7 @@ def test_solves(sparse_X, solver, datafit_cls, penalty_cls): rng = np.random.RandomState(random_state) if sparse_X: X = sparse.rand(n_samples, n_features, density=0.1, - format="csr", random_state=rng) + format="csc", random_state=rng) else: X = rng.randn(n_samples, n_features) y = rng.randn(n_samples) From 3af102e83fc231581c5210b665c5b78fce255d59 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Tue, 11 Apr 2023 15:42:16 +0200 Subject: [PATCH 31/52] sparse matrices modular jax --- skglm/gpu/solvers/jax_solver.py | 16 +++++++++++++--- skglm/gpu/tests/test_fista.py | 3 --- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/skglm/gpu/solvers/jax_solver.py b/skglm/gpu/solvers/jax_solver.py index 11a25b22e..af8334c72 100644 --- a/skglm/gpu/solvers/jax_solver.py +++ b/skglm/gpu/solvers/jax_solver.py @@ -12,6 +12,9 @@ # if not, amplifies rounding errors. jax.config.update("jax_enable_x64", True) # noqa +from scipy import sparse # noqa +from jax.experimental import sparse as jax_sparse # noqa + from skglm.gpu.solvers.base import BaseFistaSolver, BaseQuadratic, BaseL1 # noqa @@ -33,7 +36,14 @@ def solve(self, X, y, datafit, penalty): step = 1 / lipschitz # transfer to device - X_gpu = jnp.asarray(X) + if sparse.issparse(X): + # sparse matrices are still an experimental features in jax + # matrix operation are supported only for COO matrices but missing + # for CSC, CSR. hence working with COO in the wait for a new Jax release + # that adds support for these features + X_gpu = jax_sparse.BCOO.from_scipy_sparse(X) + else: + X_gpu = jnp.asarray(X) y_gpu = jnp.asarray(y) # get grad func of datafit @@ -94,11 +104,11 @@ class QuadraticJax(BaseQuadratic): def value(self, X_gpu, y_gpu, w): n_samples = X_gpu.shape[0] - return jnp.sum((jnp.dot(X_gpu, w) - y_gpu) ** 2) / (2. * n_samples) + return jnp.sum((X_gpu @ w - y_gpu) ** 2) / (2. * n_samples) def gradient(self, X_gpu, y_gpu, w): n_samples = X_gpu.shape[0] - return jnp.dot(X_gpu.T, jnp.dot(X_gpu, w) - y_gpu) / n_samples + return X_gpu.T @ (X_gpu @ w - y_gpu) / n_samples class L1Jax(BaseL1): diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index d50380559..3ffab2f87 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -21,9 +21,6 @@ (JaxSolver(use_auto_diff=False), QuadraticJax, L1Jax)]) @pytest.mark.parametrize("sparse_X", [True, False]) def test_solves(sparse_X, solver, datafit_cls, penalty_cls): - if sparse_X and isinstance(solver, JaxSolver): - pytest.xfail(reason="Sparse X is still not yet supported") - random_state = 1265 n_samples, n_features = 100, 30 reg = 1e-2 From 715d3fbcc29bf079ab16bcbd8cabcb74cdee5309 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Wed, 12 Apr 2023 11:28:22 +0200 Subject: [PATCH 32/52] modular Numba solver --- skglm/gpu/solvers/numba_solver.py | 116 +++++++++++++++++++----------- 1 file changed, 74 insertions(+), 42 deletions(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index 935f6dfa6..50b6ecc0b 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -2,33 +2,40 @@ import numpy as np from numba import cuda +from skglm.gpu.solvers.base import BaseL1, BaseQuadratic, BaseFistaSolver -from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit +import warnings +from numba.core.errors import NumbaPerformanceWarning +warnings.filterwarnings("ignore", category=NumbaPerformanceWarning) -# max number of threads that one block could contain -# https://forums.developer.nvidia.com/t/maximum-number-of-threads-on-thread-block/46392 -N_THREADS = 1024 +# Built from GPU properties +# Refer to utils to get GPU properties +MAX_1DIM_BLOCK = (1024,) +MAX_2DIM_BLOCK = (32, 32) +MAX_1DIM_GRID = (65535,) +MAX_2DIM_GRID = (65535, 65535) -class NumbaSolver: + +class NumbaSolver(BaseFistaSolver): def __init__(self, max_iter=1000, verbose=0): self.max_iter = max_iter self.verbose = verbose - def solve(self, X, y, lmbd): + def solve(self, X, y, datafit, penalty): n_samples, n_features = X.shape # compute step - lipschitz = np.linalg.norm(X, ord=2) ** 2 + lipschitz = datafit.get_lipschitz_cst(X) if lipschitz == 0.: return np.zeros(n_features) step = 1 / lipschitz - # number of block to use along each axis when calling kernel - n_blocks_axis_0, n_blocks_axis_1 = (math.ceil(n / N_THREADS) for n in X.shape) + # number of block to use along features-axis when launching kernel + n_blocks_axis_1 = math.ceil(X.shape[1] / MAX_1DIM_BLOCK[0]) # transfer to device X_gpu = cuda.to_device(X) @@ -41,35 +48,30 @@ def solve(self, X, y, lmbd): mid_w = cuda.to_device(np.zeros(n_features)) old_w = cuda.to_device(np.zeros(n_features)) - # needn't to be init with values as theses are - # they store results of computation + # needn't to be init with values as it stores results of computation grad = cuda.device_array(n_features) - minus_residual = cuda.device_array(n_samples) t_old, t_new = 1, 1 for it in range(self.max_iter): - # inplace update of minus_residual - # minus_residual = X_gpu @ mid_w - y_gpu - _compute_minus_residual[n_blocks_axis_0, N_THREADS]( - X_gpu, y_gpu, mid_w, minus_residual) - # inplace update of grad - # grad = X_gpu @ minus_residual - _compute_grad[n_blocks_axis_1, N_THREADS]( - X_gpu, minus_residual, grad) + datafit.gradient(X_gpu, y_gpu, mid_w, grad) + + _forward[n_blocks_axis_1, MAX_1DIM_BLOCK](mid_w, grad, step, mid_w) # inplace update of w - # w = ST(mid_w - step * grad, step * lmbd) - _forward_backward[n_blocks_axis_1, N_THREADS]( - mid_w, grad, step, lmbd, w) + penalty.prox(mid_w, step, w) if self.verbose: w_cpu = w.copy_to_host() - p_obj = compute_obj(X, y, lmbd, w_cpu) - opt_crit = eval_opt_crit(X, y, lmbd, w_cpu) + p_obj = datafit.value(X, y, w_cpu, X @ w_cpu) + penalty.value(w_cpu) + + datafit.gradient(X_gpu, y_gpu, w, grad) + grad_cpu = grad.copy_to_host() + + opt_crit = penalty.max_subdiff_distance(w_cpu, grad_cpu) print( f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" @@ -78,7 +80,7 @@ def solve(self, X, y, lmbd): # extrapolate coef = (t_old - 1) / t_new # mid_w = w + coef * (w - old_w) - _extrapolate[n_blocks_axis_1, N_THREADS]( + _extrapolate[n_blocks_axis_1, MAX_1DIM_BLOCK]( w, old_w, coef, mid_w) # update FISTA vars @@ -93,8 +95,32 @@ def solve(self, X, y, lmbd): return w_cpu +class QuadraticNumba(BaseQuadratic): + + def gradient(self, X_gpu, y_gpu, w, out): + minus_residual = cuda.device_array(X_gpu.shape[0]) + + n_blocks_axis_0, n_blocks_axis_1 = (math.ceil(n / MAX_1DIM_BLOCK[0]) + for n in X_gpu.shape) + + _compute_minus_residual[n_blocks_axis_0, MAX_1DIM_BLOCK]( + X_gpu, y_gpu, w, minus_residual) + + _compute_grad[n_blocks_axis_1, MAX_1DIM_BLOCK](X_gpu, minus_residual, out) + + +class L1Numba(BaseL1): + + def prox(self, value, stepsize, out): + level = stepsize * self.alpha + + n_blocks = math.ceil(value.shape[0] / MAX_1DIM_BLOCK[0]) + + _ST_vec[n_blocks, MAX_1DIM_BLOCK](value, level, out) + + @cuda.jit -def _compute_minus_residual(X_gpu, y_gpu, mid_w, out): +def _compute_minus_residual(X_gpu, y_gpu, w, out): # compute: out = X_gpu @ w - y_gpu i = cuda.grid(1) @@ -104,7 +130,7 @@ def _compute_minus_residual(X_gpu, y_gpu, mid_w, out): tmp = 0. for j in range(n_features): - tmp += X_gpu[i, j] * mid_w[j] + tmp += X_gpu[i, j] * w[j] tmp -= y_gpu[i] out[i] = tmp @@ -121,34 +147,40 @@ def _compute_grad(X_gpu, minus_residual, out): tmp = 0. for i in range(n_samples): - tmp += X_gpu[i, j] * minus_residual[i] + tmp += X_gpu[i, j] * minus_residual[i] / X_gpu.shape[0] out[j] = tmp @cuda.jit -def _forward_backward(mid_w, grad, step, lmbd, out): - # forward: mid_w = mid_w - step * grad - # backward: w = ST_vec(mid_w, step * lmbd) +def _forward(mid_w, grad, step, out): j = cuda.grid(1) n_features = len(mid_w) if j >= n_features: return - # forward - tmp = mid_w[j] - step * grad[j] + out[j] = mid_w[j] - step * grad[j] + + +@cuda.jit +def _ST_vec(value, level, out): + j = cuda.grid(1) + + n_features = value.shape[0] + if j >= n_features: + return + + value_j = value[j] - # backward - level = step * lmbd - if abs(tmp) <= level: - tmp = 0. - elif tmp > level: - tmp = tmp - level + if abs(value_j) <= level: + value_j = 0. + elif value_j > level: + value_j = value_j - level else: - tmp = tmp + level + value_j = value_j + level - out[j] = tmp + out[j] = value_j @cuda.jit From 34cc4a8d7a49a43f004ee0f550a603a9859cffd4 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Wed, 12 Apr 2023 11:31:18 +0200 Subject: [PATCH 33/52] unittest numba && dev utils --- skglm/gpu/tests/test_fista.py | 10 ++++-- skglm/gpu/tests/test_solver.py | 57 --------------------------------- skglm/gpu/tests/test_utils.py | 7 ++++ skglm/gpu/utils/device_utils.py | 16 +++++++++ 4 files changed, 31 insertions(+), 59 deletions(-) delete mode 100644 skglm/gpu/tests/test_solver.py diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index 3ffab2f87..f9d85434d 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -10,6 +10,8 @@ from skglm.gpu.solvers.cupy_solver import CupySolver, L1CuPy from skglm.gpu.solvers.jax_solver import JaxSolver, QuadraticJax, L1Jax +from skglm.gpu.solvers.numba_solver import NumbaSolver, QuadraticNumba, L1Numba + from skglm.gpu.utils.host_utils import eval_opt_crit @@ -18,11 +20,15 @@ [(CPUSolver(), BaseQuadratic, BaseL1), (CupySolver(), BaseQuadratic, L1CuPy), (JaxSolver(use_auto_diff=True), QuadraticJax, L1Jax), - (JaxSolver(use_auto_diff=False), QuadraticJax, L1Jax)]) + (JaxSolver(use_auto_diff=False), QuadraticJax, L1Jax), + (NumbaSolver(), QuadraticNumba, L1Numba)]) @pytest.mark.parametrize("sparse_X", [True, False]) def test_solves(sparse_X, solver, datafit_cls, penalty_cls): + if sparse_X and isinstance(solver, NumbaSolver): + pytest.xfail(reason="Sparse X is not yet supported for Numba") + random_state = 1265 - n_samples, n_features = 100, 30 + n_samples, n_features = 10, 3 reg = 1e-2 # generate dummy data diff --git a/skglm/gpu/tests/test_solver.py b/skglm/gpu/tests/test_solver.py deleted file mode 100644 index 924a7064a..000000000 --- a/skglm/gpu/tests/test_solver.py +++ /dev/null @@ -1,57 +0,0 @@ -import pytest - -import numpy as np -from numpy.linalg import norm -from scipy import sparse - -from skglm.gpu.solvers import CPUSolver, CupySolver, JaxSolver, NumbaSolver - -from skglm.gpu.utils.host_utils import eval_opt_crit - - -@pytest.mark.parametrize("solver", [CupySolver(), - CPUSolver(), - JaxSolver(use_auto_diff=False), - JaxSolver(use_auto_diff=True), - NumbaSolver()]) -def test_solves(solver): - random_state = 1265 - n_samples, n_features = 100, 30 - reg = 1e-2 - - # generate dummy data - rng = np.random.RandomState(random_state) - X = rng.randn(n_samples, n_features) - y = rng.randn(n_samples) - - # set lambda - lmbd_max = norm(X.T @ y, ord=np.inf) - lmbd = reg * lmbd_max - - w = solver.solve(X, y, lmbd) - - stop_crit = eval_opt_crit(X, y, lmbd, w) - - np.testing.assert_allclose(stop_crit, 0., atol=1e-9) - - -@pytest.mark.parametrize("solver", [CPUSolver(), CupySolver()]) -def test_solves_sparse(solver): - random_state = 1265 - n_samples, n_features = 100, 30 - reg = 1e-2 - - # generate dummy data - rng = np.random.RandomState(random_state) - X = sparse.rand(n_samples, n_features, density=0.1, - format="csr", random_state=rng) - y = rng.randn(n_samples) - - # set lambda - lmbd_max = norm(X.T @ y, ord=np.inf) - lmbd = reg * lmbd_max - - w = solver.solve(X, y, lmbd) - - stop_crit = eval_opt_crit(X, y, lmbd, w) - np.testing.assert_allclose(stop_crit, 0., atol=1e-7) diff --git a/skglm/gpu/tests/test_utils.py b/skglm/gpu/tests/test_utils.py index 43daee2a2..3274f5358 100644 --- a/skglm/gpu/tests/test_utils.py +++ b/skglm/gpu/tests/test_utils.py @@ -1,5 +1,6 @@ import numpy as np from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit +from skglm.gpu.utils.device_utils import get_device_properties from sklearn.linear_model import Lasso @@ -33,3 +34,9 @@ def test_eval_optimality(): eval_opt_crit(X, y, lmbd, estimator.coef_), 0., atol=1e-9 ) + + +def test_device_props(): + # check it runs and result is a dict + dev_props = get_device_properties() + np.testing.assert_equal(type(dev_props), dict) diff --git a/skglm/gpu/utils/device_utils.py b/skglm/gpu/utils/device_utils.py index e69de29bb..ef6d331cd 100644 --- a/skglm/gpu/utils/device_utils.py +++ b/skglm/gpu/utils/device_utils.py @@ -0,0 +1,16 @@ +from numba import cuda +from numba.cuda.cudadrv import enums + + +# modified version of code in +# https://stackoverflow.com/questions/62457151/access-gpu-hardware-specifications-in-python # noqa +def get_device_properties(): + device = cuda.get_current_device() + + device_props_name = [name.replace("CU_DEVICE_ATTRIBUTE_", "") + for name in dir(enums) + if name.startswith("CU_DEVICE_ATTRIBUTE_")] + + device_props_value = [getattr(device, prop) for prop in device_props_name] + + return dict(zip(device_props_name, device_props_value)) From b6f971c200f85ccf6ecf535aa4579a2f422235b5 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Wed, 12 Apr 2023 13:38:28 +0200 Subject: [PATCH 34/52] comments && prob formula --- skglm/gpu/__init__.py | 2 +- skglm/gpu/solvers/numba_solver.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/skglm/gpu/__init__.py b/skglm/gpu/__init__.py index 2a6008fdc..c7fda3ae0 100644 --- a/skglm/gpu/__init__.py +++ b/skglm/gpu/__init__.py @@ -2,5 +2,5 @@ Problem reads:: - min_w (1/2) * ||y - Xw||^2 + lmbd * ||w||_1 + min_w (1/2n) * ||y - Xw||^2 + lmbd * ||w||_1 """ diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index 50b6ecc0b..db6426616 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -58,6 +58,7 @@ def solve(self, X, y, datafit, penalty): # inplace update of grad datafit.gradient(X_gpu, y_gpu, mid_w, grad) + # inplace update of mid_w _forward[n_blocks_axis_1, MAX_1DIM_BLOCK](mid_w, grad, step, mid_w) # inplace update of w From a7d1375520535b7b37bb8cec0a14af7abcfd17fb Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Wed, 12 Apr 2023 16:06:48 +0200 Subject: [PATCH 35/52] Numba with shared memory --- skglm/gpu/solvers/numba_solver.py | 45 +++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index db6426616..85fc34d7e 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -1,5 +1,6 @@ import math import numpy as np +import numba from numba import cuda from skglm.gpu.solvers.base import BaseL1, BaseQuadratic, BaseFistaSolver @@ -109,6 +110,50 @@ def gradient(self, X_gpu, y_gpu, w, out): _compute_grad[n_blocks_axis_1, MAX_1DIM_BLOCK](X_gpu, minus_residual, out) + def gradient_2(self, X_gpu, y_gpu, w, out): + minus_residual = cuda.device_array(X_gpu.shape[0]) + + grid_dim = tuple(math.ceil(X_gpu.shape[idx] / MAX_2DIM_BLOCK[idx]) + for idx in range(2)) + + _compute_minus_residual_2[grid_dim, MAX_2DIM_BLOCK]( + X_gpu, y_gpu, w, minus_residual) + + n_blocks_axis_1 = math.ceil(X_gpu.shape[1] / MAX_1DIM_BLOCK[0]) + _compute_grad[n_blocks_axis_1, MAX_1DIM_BLOCK](X_gpu, minus_residual, out) + + +@cuda.jit +def _compute_minus_residual_2(X_gpu, y_gpu, w, out): + i, j = cuda.grid(2) + + n_samples, n_features = X_gpu.shape + if i >= n_samples or j >= n_features: + return + + t_i, t_j = cuda.threadIdx.x, cuda.threadIdx.y + sub_shape = MAX_2DIM_BLOCK # cuda.blockDim.x, cuda.blockDim.y + + sub_X = cuda.shared.array(shape=sub_shape, dtype=numba.f8) + sub_y = cuda.shared.array(shape=sub_shape[0], dtype=numba.f8) + sub_w = cuda.shared.array(shape=sub_shape[1], dtype=numba.f8) + + # load data in shared memory + sub_X[t_i, t_j] = X_gpu[i, j] + sub_y[t_i] = y_gpu[i] + sub_w[t_j] = w[j] + + cuda.syncthreads() + + tmp = 0. + for k in range(sub_shape[1]): + tmp += sub_X[t_i, k] * sub_w[k] + tmp -= sub_y[t_i] + + cuda.syncthreads() + + out[i] += tmp + class L1Numba(BaseL1): From c786a12c8d7d7a6a7ed32d46e42427ae05ecfcd7 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Thu, 13 Apr 2023 15:19:36 +0200 Subject: [PATCH 36/52] Numba shared memory version --- skglm/gpu/solvers/numba_solver.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index 85fc34d7e..c9c98868c 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -126,33 +126,26 @@ def gradient_2(self, X_gpu, y_gpu, w, out): @cuda.jit def _compute_minus_residual_2(X_gpu, y_gpu, w, out): i, j = cuda.grid(2) - n_samples, n_features = X_gpu.shape - if i >= n_samples or j >= n_features: - return + stride_x = cuda.gridDim.x * cuda.blockDim.x + stride_y = cuda.gridDim.y * cuda.blockDim.y t_i, t_j = cuda.threadIdx.x, cuda.threadIdx.y - sub_shape = MAX_2DIM_BLOCK # cuda.blockDim.x, cuda.blockDim.y + sub_shape = MAX_2DIM_BLOCK sub_X = cuda.shared.array(shape=sub_shape, dtype=numba.f8) - sub_y = cuda.shared.array(shape=sub_shape[0], dtype=numba.f8) sub_w = cuda.shared.array(shape=sub_shape[1], dtype=numba.f8) - # load data in shared memory - sub_X[t_i, t_j] = X_gpu[i, j] - sub_y[t_i] = y_gpu[i] - sub_w[t_j] = w[j] + for ii in range(i, n_samples, stride_x): + for jj in range(j, n_features, stride_y): - cuda.syncthreads() - - tmp = 0. - for k in range(sub_shape[1]): - tmp += sub_X[t_i, k] * sub_w[k] - tmp -= sub_y[t_i] + # load data in shared memory + sub_X[t_i, t_j] = X_gpu[ii, jj] + sub_w[t_j] = w[jj] - cuda.syncthreads() + cuda.syncthreads() - out[i] += tmp + cuda.atomic.add(out, ii, sub_X[t_i, t_j] * sub_w[t_j]) class L1Numba(BaseL1): From 2c4cd63d89e70937e27a92f1b7a9adbae5f17736 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Thu, 13 Apr 2023 15:47:17 +0200 Subject: [PATCH 37/52] kernels as static methods && Numba fix tests --- skglm/gpu/solvers/numba_solver.py | 128 +++++++++++------------------- skglm/gpu/tests/test_fista.py | 2 +- 2 files changed, 47 insertions(+), 83 deletions(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index c9c98868c..c83c2a175 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -1,6 +1,5 @@ import math import numpy as np -import numba from numba import cuda from skglm.gpu.solvers.base import BaseL1, BaseQuadratic, BaseFistaSolver @@ -12,7 +11,7 @@ # Built from GPU properties -# Refer to utils to get GPU properties +# Refer to `utils` to get GPU properties MAX_1DIM_BLOCK = (1024,) MAX_2DIM_BLOCK = (32, 32) MAX_1DIM_GRID = (65535,) @@ -105,47 +104,44 @@ def gradient(self, X_gpu, y_gpu, w, out): n_blocks_axis_0, n_blocks_axis_1 = (math.ceil(n / MAX_1DIM_BLOCK[0]) for n in X_gpu.shape) - _compute_minus_residual[n_blocks_axis_0, MAX_1DIM_BLOCK]( + QuadraticNumba._compute_minus_residual[n_blocks_axis_0, MAX_1DIM_BLOCK]( X_gpu, y_gpu, w, minus_residual) - _compute_grad[n_blocks_axis_1, MAX_1DIM_BLOCK](X_gpu, minus_residual, out) + QuadraticNumba._compute_grad[n_blocks_axis_1, MAX_1DIM_BLOCK]( + X_gpu, minus_residual, out) - def gradient_2(self, X_gpu, y_gpu, w, out): - minus_residual = cuda.device_array(X_gpu.shape[0]) - - grid_dim = tuple(math.ceil(X_gpu.shape[idx] / MAX_2DIM_BLOCK[idx]) - for idx in range(2)) - - _compute_minus_residual_2[grid_dim, MAX_2DIM_BLOCK]( - X_gpu, y_gpu, w, minus_residual) + @staticmethod + @cuda.jit + def _compute_minus_residual(X_gpu, y_gpu, w, out): + # compute: out = X_gpu @ w - y_gpu + i = cuda.grid(1) - n_blocks_axis_1 = math.ceil(X_gpu.shape[1] / MAX_1DIM_BLOCK[0]) - _compute_grad[n_blocks_axis_1, MAX_1DIM_BLOCK](X_gpu, minus_residual, out) - - -@cuda.jit -def _compute_minus_residual_2(X_gpu, y_gpu, w, out): - i, j = cuda.grid(2) - n_samples, n_features = X_gpu.shape + n_samples, n_features = X_gpu.shape + if i >= n_samples: + return - stride_x = cuda.gridDim.x * cuda.blockDim.x - stride_y = cuda.gridDim.y * cuda.blockDim.y - t_i, t_j = cuda.threadIdx.x, cuda.threadIdx.y + tmp = 0. + for j in range(n_features): + tmp += X_gpu[i, j] * w[j] + tmp -= y_gpu[i] - sub_shape = MAX_2DIM_BLOCK - sub_X = cuda.shared.array(shape=sub_shape, dtype=numba.f8) - sub_w = cuda.shared.array(shape=sub_shape[1], dtype=numba.f8) + out[i] = tmp - for ii in range(i, n_samples, stride_x): - for jj in range(j, n_features, stride_y): + @staticmethod + @cuda.jit + def _compute_grad(X_gpu, minus_residual, out): + # compute: out = X.T @ minus_residual + j = cuda.grid(1) - # load data in shared memory - sub_X[t_i, t_j] = X_gpu[ii, jj] - sub_w[t_j] = w[jj] + n_samples, n_features = X_gpu.shape + if j >= n_features: + return - cuda.syncthreads() + tmp = 0. + for i in range(n_samples): + tmp += X_gpu[i, j] * minus_residual[i] / X_gpu.shape[0] - cuda.atomic.add(out, ii, sub_X[t_i, t_j] * sub_w[t_j]) + out[j] = tmp class L1Numba(BaseL1): @@ -155,42 +151,30 @@ def prox(self, value, stepsize, out): n_blocks = math.ceil(value.shape[0] / MAX_1DIM_BLOCK[0]) - _ST_vec[n_blocks, MAX_1DIM_BLOCK](value, level, out) - - -@cuda.jit -def _compute_minus_residual(X_gpu, y_gpu, w, out): - # compute: out = X_gpu @ w - y_gpu - i = cuda.grid(1) - - n_samples, n_features = X_gpu.shape - if i >= n_samples: - return - - tmp = 0. - for j in range(n_features): - tmp += X_gpu[i, j] * w[j] - tmp -= y_gpu[i] - - out[i] = tmp + L1Numba._ST_vec[n_blocks, MAX_1DIM_BLOCK](value, level, out) + @staticmethod + @cuda.jit + def _ST_vec(value, level, out): + j = cuda.grid(1) -@cuda.jit -def _compute_grad(X_gpu, minus_residual, out): - # compute: out = X.T @ minus_residual - j = cuda.grid(1) + n_features = value.shape[0] + if j >= n_features: + return - n_samples, n_features = X_gpu.shape - if j >= n_features: - return + value_j = value[j] - tmp = 0. - for i in range(n_samples): - tmp += X_gpu[i, j] * minus_residual[i] / X_gpu.shape[0] + if abs(value_j) <= level: + value_j = 0. + elif value_j > level: + value_j = value_j - level + else: + value_j = value_j + level - out[j] = tmp + out[j] = value_j +# solver kernels @cuda.jit def _forward(mid_w, grad, step, out): j = cuda.grid(1) @@ -202,26 +186,6 @@ def _forward(mid_w, grad, step, out): out[j] = mid_w[j] - step * grad[j] -@cuda.jit -def _ST_vec(value, level, out): - j = cuda.grid(1) - - n_features = value.shape[0] - if j >= n_features: - return - - value_j = value[j] - - if abs(value_j) <= level: - value_j = 0. - elif value_j > level: - value_j = value_j - level - else: - value_j = value_j + level - - out[j] = value_j - - @cuda.jit def _extrapolate(w, old_w, coef, out): # compute: out = w + coef * (w - old_w) diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index f9d85434d..fdbde5ae1 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -28,7 +28,7 @@ def test_solves(sparse_X, solver, datafit_cls, penalty_cls): pytest.xfail(reason="Sparse X is not yet supported for Numba") random_state = 1265 - n_samples, n_features = 10, 3 + n_samples, n_features = 100, 30 reg = 1e-2 # generate dummy data From e3ac70aa6a02bbb7bafbafb57589f274d6be950b Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Thu, 13 Apr 2023 16:55:45 +0200 Subject: [PATCH 38/52] sparse Numba solver --- skglm/gpu/solvers/numba_solver.py | 67 ++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index c83c2a175..47fea15b3 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -2,6 +2,8 @@ import numpy as np from numba import cuda +from scipy import sparse + from skglm.gpu.solvers.base import BaseL1, BaseQuadratic, BaseFistaSolver import warnings @@ -26,6 +28,7 @@ def __init__(self, max_iter=1000, verbose=0): def solve(self, X, y, datafit, penalty): n_samples, n_features = X.shape + X_is_sparse = sparse.issparse(X) # compute step lipschitz = datafit.get_lipschitz_cst(X) @@ -38,7 +41,15 @@ def solve(self, X, y, datafit, penalty): n_blocks_axis_1 = math.ceil(X.shape[1] / MAX_1DIM_BLOCK[0]) # transfer to device - X_gpu = cuda.to_device(X) + if X_is_sparse: + X_gpu_bundles = ( + cuda.to_device(X.data), + cuda.to_device(X.indptr), + cuda.to_device(X.indices), + X.shape, + ) + else: + X_gpu = cuda.to_device(X) y_gpu = cuda.to_device(y) # init vars on device @@ -56,7 +67,10 @@ def solve(self, X, y, datafit, penalty): for it in range(self.max_iter): # inplace update of grad - datafit.gradient(X_gpu, y_gpu, mid_w, grad) + if X_is_sparse: + datafit.sparse_gradient(*X_gpu_bundles, y_gpu, mid_w, grad) + else: + datafit.gradient(X_gpu, y_gpu, mid_w, grad) # inplace update of mid_w _forward[n_blocks_axis_1, MAX_1DIM_BLOCK](mid_w, grad, step, mid_w) @@ -110,6 +124,20 @@ def gradient(self, X_gpu, y_gpu, w, out): QuadraticNumba._compute_grad[n_blocks_axis_1, MAX_1DIM_BLOCK]( X_gpu, minus_residual, out) + def sparse_gradient(self, X_gpu_data, X_gpu_indptr, X_gpu_indices, X_gpu_shape, + y_gpu, w, out): + minus_residual = cuda.device_array(X_gpu_shape[0]) + + n_blocks = math.ceil(X_gpu_shape[1] / MAX_1DIM_BLOCK[0]) + + QuadraticNumba._sparse_compute_minus_residual[n_blocks, MAX_1DIM_BLOCK]( + X_gpu_data, X_gpu_indptr, X_gpu_indices, X_gpu_shape, + y_gpu, w, minus_residual) + + QuadraticNumba._sparse_compute_grad[n_blocks, MAX_1DIM_BLOCK]( + X_gpu_data, X_gpu_indptr, X_gpu_indices, X_gpu_shape, + minus_residual, out) + @staticmethod @cuda.jit def _compute_minus_residual(X_gpu, y_gpu, w, out): @@ -143,6 +171,41 @@ def _compute_grad(X_gpu, minus_residual, out): out[j] = tmp + @staticmethod + @cuda.jit + def _sparse_compute_minus_residual(X_gpu_data, X_gpu_indptr, X_gpu_indices, + X_gpu_shape, y_gpu, w, out): + j = cuda.grid(1) + + n_samples, n_features = X_gpu_shape + if j >= n_features: + return + + stride_x = cuda.gridDim.x * cuda.blockDim.x + for jj in range(j, n_samples, stride_x): + cuda.atomic.add(out, jj, -y_gpu[jj]) + + for idx in range(X_gpu_indptr[j], X_gpu_indptr[j+1]): + i = X_gpu_indices[idx] + cuda.atomic.add(out, i, w[j] * X_gpu_data[idx]) + + @staticmethod + @cuda.jit + def _sparse_compute_grad(X_gpu_data, X_gpu_indptr, X_gpu_indices, X_gpu_shape, + minus_residual, out): + j = cuda.grid(1) + + n_features = X_gpu_shape[1] + if j >= n_features: + return + + tmp = 0. + for idx in range(X_gpu_indptr[j], X_gpu_indptr[j+1]): + i = X_gpu_indices[idx] + tmp += X_gpu_data[idx] * minus_residual[i] + + out[j] = tmp + class L1Numba(BaseL1): From ce4367df2f2b8ba1b683374daf56ca1540cd455f Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Thu, 13 Apr 2023 17:54:00 +0200 Subject: [PATCH 39/52] fix bug numba gradient --- skglm/gpu/solvers/numba_solver.py | 11 +++++------ skglm/gpu/tests/test_fista.py | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index 47fea15b3..be0a8b7c9 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -83,7 +83,7 @@ def solve(self, X, y, datafit, penalty): p_obj = datafit.value(X, y, w_cpu, X @ w_cpu) + penalty.value(w_cpu) - datafit.gradient(X_gpu, y_gpu, w, grad) + datafit.sparse_gradient(*X_gpu_bundles, y_gpu, w, grad) grad_cpu = grad.copy_to_host() opt_crit = penalty.max_subdiff_distance(w_cpu, grad_cpu) @@ -181,9 +181,8 @@ def _sparse_compute_minus_residual(X_gpu_data, X_gpu_indptr, X_gpu_indices, if j >= n_features: return - stride_x = cuda.gridDim.x * cuda.blockDim.x - for jj in range(j, n_samples, stride_x): - cuda.atomic.add(out, jj, -y_gpu[jj]) + for jj in range(j, n_samples, n_features): + cuda.atomic.sub(out, jj, y_gpu[jj]) for idx in range(X_gpu_indptr[j], X_gpu_indptr[j+1]): i = X_gpu_indices[idx] @@ -195,14 +194,14 @@ def _sparse_compute_grad(X_gpu_data, X_gpu_indptr, X_gpu_indices, X_gpu_shape, minus_residual, out): j = cuda.grid(1) - n_features = X_gpu_shape[1] + n_samples, n_features = X_gpu_shape if j >= n_features: return tmp = 0. for idx in range(X_gpu_indptr[j], X_gpu_indptr[j+1]): i = X_gpu_indices[idx] - tmp += X_gpu_data[idx] * minus_residual[i] + tmp += X_gpu_data[idx] * minus_residual[i] / n_samples out[j] = tmp diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index fdbde5ae1..745c4567f 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -48,8 +48,9 @@ def test_solves(sparse_X, solver, datafit_cls, penalty_cls): stop_crit = eval_opt_crit(X, y, lmbd, w) - np.testing.assert_allclose(stop_crit, 0., atol=1e-9) + np.testing.assert_allclose(stop_crit, 0., atol=1e-8) if __name__ == "__main__": + # test_solves(True, NumbaSolver(verbose=1, max_iter=100), QuadraticNumba, L1Numba) pass From c8fd8a1af37a5d3c645cb30d8362a31a995fde6a Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 14 Apr 2023 11:18:42 +0200 Subject: [PATCH 40/52] fix bug numba sparse residual --- skglm/gpu/solvers/cpu_solver.py | 2 +- skglm/gpu/solvers/numba_solver.py | 7 +++++-- skglm/gpu/tests/test_fista.py | 4 ---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/skglm/gpu/solvers/cpu_solver.py b/skglm/gpu/solvers/cpu_solver.py index f27f89e5e..779627866 100644 --- a/skglm/gpu/solvers/cpu_solver.py +++ b/skglm/gpu/solvers/cpu_solver.py @@ -37,7 +37,7 @@ def solve(self, X, y, datafit, penalty): if self.verbose: p_obj = datafit.value(X, y, w, X @ w) + penalty.value(w) - grad = datafit.gradient(X, y, mid_w, X @ mid_w) + grad = datafit.gradient(X, y, w, X @ w) opt_crit = penalty.max_subdiff_distance(w, grad) print( diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index be0a8b7c9..ba2bf3c73 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -83,7 +83,10 @@ def solve(self, X, y, datafit, penalty): p_obj = datafit.value(X, y, w_cpu, X @ w_cpu) + penalty.value(w_cpu) - datafit.sparse_gradient(*X_gpu_bundles, y_gpu, w, grad) + if X_is_sparse: + datafit.sparse_gradient(*X_gpu_bundles, y_gpu, w, grad) + else: + datafit.gradient(X_gpu, y_gpu, w, grad) grad_cpu = grad.copy_to_host() opt_crit = penalty.max_subdiff_distance(w_cpu, grad_cpu) @@ -126,7 +129,7 @@ def gradient(self, X_gpu, y_gpu, w, out): def sparse_gradient(self, X_gpu_data, X_gpu_indptr, X_gpu_indices, X_gpu_shape, y_gpu, w, out): - minus_residual = cuda.device_array(X_gpu_shape[0]) + minus_residual = cuda.to_device(np.zeros(X_gpu_shape[0])) n_blocks = math.ceil(X_gpu_shape[1] / MAX_1DIM_BLOCK[0]) diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index 745c4567f..3e4aff810 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -24,9 +24,6 @@ (NumbaSolver(), QuadraticNumba, L1Numba)]) @pytest.mark.parametrize("sparse_X", [True, False]) def test_solves(sparse_X, solver, datafit_cls, penalty_cls): - if sparse_X and isinstance(solver, NumbaSolver): - pytest.xfail(reason="Sparse X is not yet supported for Numba") - random_state = 1265 n_samples, n_features = 100, 30 reg = 1e-2 @@ -52,5 +49,4 @@ def test_solves(sparse_X, solver, datafit_cls, penalty_cls): if __name__ == "__main__": - # test_solves(True, NumbaSolver(verbose=1, max_iter=100), QuadraticNumba, L1Numba) pass From a6df22ad0ee84e268450f749324328a3bc280176 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 14 Apr 2023 11:20:23 +0200 Subject: [PATCH 41/52] n_samples instead of shape --- skglm/gpu/solvers/numba_solver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index ba2bf3c73..13ce25453 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -170,7 +170,7 @@ def _compute_grad(X_gpu, minus_residual, out): tmp = 0. for i in range(n_samples): - tmp += X_gpu[i, j] * minus_residual[i] / X_gpu.shape[0] + tmp += X_gpu[i, j] * minus_residual[i] / n_samples out[j] = tmp From cf5dc9ef696099e67cb5c15e5b791e7876d69a39 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 14 Apr 2023 11:27:52 +0200 Subject: [PATCH 42/52] Numba_solver: striding for scalable kernels --- skglm/gpu/solvers/numba_solver.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index 13ce25453..2f511421c 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -244,11 +244,11 @@ def _ST_vec(value, level, out): def _forward(mid_w, grad, step, out): j = cuda.grid(1) - n_features = len(mid_w) - if j >= n_features: - return + n_features = mid_w.shape[0] + stride_y = cuda.gridDim.x * cuda.blockDim.x - out[j] = mid_w[j] - step * grad[j] + for jj in range(j, n_features, stride_y): + out[jj] = mid_w[jj] - step * grad[jj] @cuda.jit @@ -256,8 +256,8 @@ def _extrapolate(w, old_w, coef, out): # compute: out = w + coef * (w - old_w) j = cuda.grid(1) - n_features = len(w) - if j >= n_features: - return + n_features = w.shape[0] + stride_y = cuda.gridDim.x * cuda.blockDim.x - out[j] = w[j] + coef * (w[j] - old_w[j]) + for jj in range(j, n_features, stride_y): + out[jj] = w[jj] + coef * (w[jj] - old_w[jj]) From 20f927431ed2aee5cf44ac774588e809d5784e66 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 14 Apr 2023 11:31:21 +0200 Subject: [PATCH 43/52] Numba_L1: striding for scalable kernels --- skglm/gpu/solvers/numba_solver.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index 2f511421c..a962f81a3 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -224,19 +224,19 @@ def _ST_vec(value, level, out): j = cuda.grid(1) n_features = value.shape[0] - if j >= n_features: - return + stride_y = cuda.gridDim.x * cuda.blockDim.x - value_j = value[j] + for jj in range(j, n_features, stride_y): + value_j = value[jj] - if abs(value_j) <= level: - value_j = 0. - elif value_j > level: - value_j = value_j - level - else: - value_j = value_j + level + if abs(value_j) <= level: + value_j = 0. + elif value_j > level: + value_j = value_j - level + else: + value_j = value_j + level - out[j] = value_j + out[jj] = value_j # solver kernels From d8d7157efc51f9b66283b60bcc6afd093b0732a8 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 14 Apr 2023 11:46:28 +0200 Subject: [PATCH 44/52] Numba sparse datafit: striding --- skglm/gpu/solvers/numba_solver.py | 32 +++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index a962f81a3..fef51fc47 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -181,15 +181,19 @@ def _sparse_compute_minus_residual(X_gpu_data, X_gpu_indptr, X_gpu_indices, j = cuda.grid(1) n_samples, n_features = X_gpu_shape - if j >= n_features: - return + stride_y = cuda.gridDim.x * cuda.blockDim.x - for jj in range(j, n_samples, n_features): - cuda.atomic.sub(out, jj, y_gpu[jj]) + for jj in range(j, n_features, stride_y): + + # out -= y_gpu + # small hack to perform this operation using + # the (features) threads instead of launching others + for idx in range(jj, n_samples, n_features): + cuda.atomic.sub(out, idx, y_gpu[idx]) - for idx in range(X_gpu_indptr[j], X_gpu_indptr[j+1]): - i = X_gpu_indices[idx] - cuda.atomic.add(out, i, w[j] * X_gpu_data[idx]) + for idx in range(X_gpu_indptr[jj], X_gpu_indptr[jj+1]): + i = X_gpu_indices[idx] + cuda.atomic.add(out, i, w[jj] * X_gpu_data[idx]) @staticmethod @cuda.jit @@ -198,15 +202,15 @@ def _sparse_compute_grad(X_gpu_data, X_gpu_indptr, X_gpu_indices, X_gpu_shape, j = cuda.grid(1) n_samples, n_features = X_gpu_shape - if j >= n_features: - return + stride_y = cuda.gridDim.x * cuda.blockDim.x - tmp = 0. - for idx in range(X_gpu_indptr[j], X_gpu_indptr[j+1]): - i = X_gpu_indices[idx] - tmp += X_gpu_data[idx] * minus_residual[i] / n_samples + for jj in range(j, n_features, stride_y): + tmp = 0. + for idx in range(X_gpu_indptr[jj], X_gpu_indptr[jj+1]): + i = X_gpu_indices[idx] + tmp += X_gpu_data[idx] * minus_residual[i] / n_samples - out[j] = tmp + out[jj] = tmp class L1Numba(BaseL1): From 4e4e6c1a5b4451011d4ca09730d817108a0571d9 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 14 Apr 2023 11:50:11 +0200 Subject: [PATCH 45/52] Numba dense datafit: striding --- skglm/gpu/solvers/numba_solver.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index fef51fc47..c09fab715 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -148,15 +148,16 @@ def _compute_minus_residual(X_gpu, y_gpu, w, out): i = cuda.grid(1) n_samples, n_features = X_gpu.shape - if i >= n_samples: - return + stride_x = cuda.gridDim.x * cuda.blockDim.x - tmp = 0. - for j in range(n_features): - tmp += X_gpu[i, j] * w[j] - tmp -= y_gpu[i] + for ii in range(i, n_samples, stride_x): - out[i] = tmp + tmp = 0. + for j in range(n_features): + tmp += X_gpu[ii, j] * w[j] + tmp -= y_gpu[ii] + + out[ii] = tmp @staticmethod @cuda.jit @@ -165,14 +166,15 @@ def _compute_grad(X_gpu, minus_residual, out): j = cuda.grid(1) n_samples, n_features = X_gpu.shape - if j >= n_features: - return + stride_y = cuda.gridDim.x * cuda.blockDim.x - tmp = 0. - for i in range(n_samples): - tmp += X_gpu[i, j] * minus_residual[i] / n_samples + for jj in range(j, n_features, stride_y): - out[j] = tmp + tmp = 0. + for i in range(n_samples): + tmp += X_gpu[i, jj] * minus_residual[i] / n_samples + + out[jj] = tmp @staticmethod @cuda.jit From 1d07d9e894f15a91eafe95fcd77da11ab6becc1c Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 14 Apr 2023 13:56:32 +0200 Subject: [PATCH 46/52] info comments Numba solver --- skglm/gpu/solvers/numba_solver.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/skglm/gpu/solvers/numba_solver.py b/skglm/gpu/solvers/numba_solver.py index c09fab715..96f30c5b3 100644 --- a/skglm/gpu/solvers/numba_solver.py +++ b/skglm/gpu/solvers/numba_solver.py @@ -129,6 +129,8 @@ def gradient(self, X_gpu, y_gpu, w, out): def sparse_gradient(self, X_gpu_data, X_gpu_indptr, X_gpu_indices, X_gpu_shape, y_gpu, w, out): + # init as zero as it is used in the computation + # otherwise stale values on GPU are used minus_residual = cuda.to_device(np.zeros(X_gpu_shape[0])) n_blocks = math.ceil(X_gpu_shape[1] / MAX_1DIM_BLOCK[0]) @@ -152,6 +154,7 @@ def _compute_minus_residual(X_gpu, y_gpu, w, out): for ii in range(i, n_samples, stride_x): + # out[ii] = X_gpu[i, :] @ w - y_gpu tmp = 0. for j in range(n_features): tmp += X_gpu[ii, j] * w[j] @@ -170,6 +173,7 @@ def _compute_grad(X_gpu, minus_residual, out): for jj in range(j, n_features, stride_y): + # out[jj] = X_gpu[:, jj] @ minus_residual / n_samples tmp = 0. for i in range(n_samples): tmp += X_gpu[i, jj] * minus_residual[i] / n_samples @@ -193,6 +197,7 @@ def _sparse_compute_minus_residual(X_gpu_data, X_gpu_indptr, X_gpu_indices, for idx in range(jj, n_samples, n_features): cuda.atomic.sub(out, idx, y_gpu[idx]) + # out[i] = w[jj] * X_gpu[:, jj] for idx in range(X_gpu_indptr[jj], X_gpu_indptr[jj+1]): i = X_gpu_indices[idx] cuda.atomic.add(out, i, w[jj] * X_gpu_data[idx]) @@ -207,6 +212,8 @@ def _sparse_compute_grad(X_gpu_data, X_gpu_indptr, X_gpu_indices, X_gpu_shape, stride_y = cuda.gridDim.x * cuda.blockDim.x for jj in range(j, n_features, stride_y): + + # out[jj] = X_gpu[:, jj] @ minus_residual / n_samples tmp = 0. for idx in range(X_gpu_indptr[jj], X_gpu_indptr[jj+1]): i = X_gpu_indices[idx] @@ -232,6 +239,7 @@ def _ST_vec(value, level, out): n_features = value.shape[0] stride_y = cuda.gridDim.x * cuda.blockDim.x + # out = ST(value, level) for jj in range(j, n_features, stride_y): value_j = value[jj] @@ -253,17 +261,18 @@ def _forward(mid_w, grad, step, out): n_features = mid_w.shape[0] stride_y = cuda.gridDim.x * cuda.blockDim.x + # out = mid_w - step * grad for jj in range(j, n_features, stride_y): out[jj] = mid_w[jj] - step * grad[jj] @cuda.jit def _extrapolate(w, old_w, coef, out): - # compute: out = w + coef * (w - old_w) j = cuda.grid(1) n_features = w.shape[0] stride_y = cuda.gridDim.x * cuda.blockDim.x + # out = w + coef * (w - old_w) for jj in range(j, n_features, stride_y): out[jj] = w[jj] + coef * (w[jj] - old_w[jj]) From ca9f69499d35e630de0031e2002dcb706b7934ab Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 14 Apr 2023 16:26:01 +0200 Subject: [PATCH 47/52] update installation && normalize df and pen cupy --- skglm/gpu/README.md | 5 ++++- skglm/gpu/solvers/cupy_solver.py | 6 +++++- skglm/gpu/tests/test_fista.py | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/skglm/gpu/README.md b/skglm/gpu/README.md index 2dcf69256..6fc49b761 100644 --- a/skglm/gpu/README.md +++ b/skglm/gpu/README.md @@ -27,7 +27,10 @@ pip install skglm -e . 4. install dependencies ```shell # cupy -conda conda install -c conda-forge cupy cudatoolkit=11.5 +conda install -c conda-forge cupy cudatoolkit=11.5 + +# pytorch +pip install torch # jax conda install jaxlib=*=*cuda* jax cuda-nvcc -c conda-forge -c nvidia diff --git a/skglm/gpu/solvers/cupy_solver.py b/skglm/gpu/solvers/cupy_solver.py index 43203e086..0cdb72303 100644 --- a/skglm/gpu/solvers/cupy_solver.py +++ b/skglm/gpu/solvers/cupy_solver.py @@ -4,7 +4,7 @@ import numpy as np from scipy import sparse -from skglm.gpu.solvers.base import BaseFistaSolver, BaseL1 +from skglm.gpu.solvers.base import BaseFistaSolver, BaseL1, BaseQuadratic class CupySolver(BaseFistaSolver): @@ -71,6 +71,10 @@ def solve(self, X, y, datafit, penalty): return w_cpu +class QuadraticCuPy(BaseQuadratic): + pass + + class L1CuPy(BaseL1): def prox(self, value, stepsize): diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index 3e4aff810..4d1822da2 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -8,8 +8,8 @@ from skglm.gpu.solvers import CPUSolver from skglm.gpu.solvers.base import BaseQuadratic, BaseL1 -from skglm.gpu.solvers.cupy_solver import CupySolver, L1CuPy from skglm.gpu.solvers.jax_solver import JaxSolver, QuadraticJax, L1Jax +from skglm.gpu.solvers.cupy_solver import CupySolver, QuadraticCuPy, L1CuPy from skglm.gpu.solvers.numba_solver import NumbaSolver, QuadraticNumba, L1Numba @@ -18,7 +18,7 @@ @pytest.mark.parametrize("solver, datafit_cls, penalty_cls", [(CPUSolver(), BaseQuadratic, BaseL1), - (CupySolver(), BaseQuadratic, L1CuPy), + (CupySolver(), QuadraticCuPy, L1CuPy), (JaxSolver(use_auto_diff=True), QuadraticJax, L1Jax), (JaxSolver(use_auto_diff=False), QuadraticJax, L1Jax), (NumbaSolver(), QuadraticNumba, L1Numba)]) From 324cac59f68e73853cf6d4b9810975acbe034170 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 14 Apr 2023 17:56:08 +0200 Subject: [PATCH 48/52] pytorch solver [buggy] --- skglm/gpu/solvers/pytorch_solver.py | 105 ++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 skglm/gpu/solvers/pytorch_solver.py diff --git a/skglm/gpu/solvers/pytorch_solver.py b/skglm/gpu/solvers/pytorch_solver.py new file mode 100644 index 000000000..93a0b659a --- /dev/null +++ b/skglm/gpu/solvers/pytorch_solver.py @@ -0,0 +1,105 @@ +import torch + +import numpy as np +from scipy import sparse + +from skglm.gpu.solvers.base import BaseFistaSolver, BaseQuadratic, BaseL1 + + +class PytorchSolver(BaseFistaSolver): + + def __init__(self, max_iter=1000, use_auto_diff=True, verbose=0): + self.max_iter = max_iter + self.use_auto_diff = use_auto_diff + self.verbose = verbose + + def solve(self, X, y, datafit, penalty): + n_samples, n_features = X.shape + X_is_sparse = sparse.issparse(X) + + # compute step + lipschitz = datafit.get_lipschitz_cst(X) + if lipschitz == 0.: + return np.zeros(n_features) + + step = 1 / lipschitz + + # transfer data + selected_device = torch.device("cuda") + X_gpu = torch.tensor(X, requires_grad=False, device=selected_device) + y_gpu = torch.tensor(y, requires_grad=False, device=selected_device) + + # init vars + w = torch.zeros(n_features, dtype=torch.float64, device=selected_device) + old_w = torch.zeros(n_features, dtype=torch.float64, device=selected_device) + mid_w = torch.zeros(n_features, dtype=torch.float64, device=selected_device, + requires_grad=self.use_auto_diff) + + t_old, t_new = 1, 1 + + for it in range(self.max_iter): + + # compute gradient + if self.use_auto_diff: + datafit_value = datafit.value(X_gpu, y_gpu, mid_w) + datafit_value.backward() + + grad = mid_w.grad + else: + grad = datafit.gradient(X_gpu, y_gpu, mid_w) + + # forward / backward + with torch.no_grad(): + w = penalty.prox(mid_w - grad, step) + + if self.verbose: + # transfer back to host + w_cpu = w.cpu().numpy() + + p_obj = datafit.value(X, y, w_cpu) + penalty.value(w_cpu) + + if self.use_auto_diff: + w_tmp = torch.tensor(w, dtype=torch.float64, + device=selected_device, requires_grad=True) + + datafit_value = datafit.value(X_gpu, y_gpu, w_tmp) + datafit_value.backward() + + grad_cpu = w_tmp.grad.detach().cpu().numpy() + else: + grad_cpu = datafit.gradient(X, y, w_cpu) + + opt_crit = penalty.max_subdiff_distance(w_cpu, grad_cpu) + + print( + f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}" + ) + + # extrapolate + mid_w = w + ((t_old - 1) / t_new) * (w - old_w) + mid_w = mid_w.requires_grad_(self.use_auto_diff) + + # update FISTA vars + t_old = t_new + t_new = 0.5 * (1 + np.sqrt(1. + 4. * t_old ** 2)) + # no need to copy `w` since its update (forward/backward) + # creates a new instance + old_w = w + + # transfer back to host + w_cpu = w.cpu().numpy() + + return w_cpu + + +class QuadraticPytorch(BaseQuadratic): + + def value(self, X, y, w): + return ((y - X @ w) ** 2).sum() / (2 * len(y)) + + +class L1Pytorch(BaseL1): + + def prox(self, value, stepsize): + shifted_value = torch.abs(value) - stepsize * self.alpha + return torch.sign(value) * torch.maximum(shifted_value, torch.tensor(0.)) From c5c1dfefddab3c89f6ef751dc3a934426c41bacb Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 14 Apr 2023 18:13:51 +0200 Subject: [PATCH 49/52] fix grad bug pytorch solver && unittest --- skglm/gpu/solvers/pytorch_solver.py | 9 ++++++--- skglm/gpu/tests/test_fista.py | 24 ++++++++++++++++-------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/skglm/gpu/solvers/pytorch_solver.py b/skglm/gpu/solvers/pytorch_solver.py index 93a0b659a..ed274ae59 100644 --- a/skglm/gpu/solvers/pytorch_solver.py +++ b/skglm/gpu/solvers/pytorch_solver.py @@ -26,8 +26,8 @@ def solve(self, X, y, datafit, penalty): # transfer data selected_device = torch.device("cuda") - X_gpu = torch.tensor(X, requires_grad=False, device=selected_device) - y_gpu = torch.tensor(y, requires_grad=False, device=selected_device) + X_gpu = torch.tensor(X, device=selected_device) + y_gpu = torch.tensor(y, device=selected_device) # init vars w = torch.zeros(n_features, dtype=torch.float64, device=selected_device) @@ -50,7 +50,7 @@ def solve(self, X, y, datafit, penalty): # forward / backward with torch.no_grad(): - w = penalty.prox(mid_w - grad, step) + w = penalty.prox(mid_w - step * grad, step) if self.verbose: # transfer back to host @@ -97,6 +97,9 @@ class QuadraticPytorch(BaseQuadratic): def value(self, X, y, w): return ((y - X @ w) ** 2).sum() / (2 * len(y)) + def gradient(self, X, y, w): + return X.T @ (X @ w - y) / X.shape[0] + class L1Pytorch(BaseL1): diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index 4d1822da2..7d72438e4 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -11,19 +11,27 @@ from skglm.gpu.solvers.jax_solver import JaxSolver, QuadraticJax, L1Jax from skglm.gpu.solvers.cupy_solver import CupySolver, QuadraticCuPy, L1CuPy from skglm.gpu.solvers.numba_solver import NumbaSolver, QuadraticNumba, L1Numba - +from skglm.gpu.solvers.pytorch_solver import PytorchSolver, QuadraticPytorch, L1Pytorch from skglm.gpu.utils.host_utils import eval_opt_crit -@pytest.mark.parametrize("solver, datafit_cls, penalty_cls", - [(CPUSolver(), BaseQuadratic, BaseL1), - (CupySolver(), QuadraticCuPy, L1CuPy), - (JaxSolver(use_auto_diff=True), QuadraticJax, L1Jax), - (JaxSolver(use_auto_diff=False), QuadraticJax, L1Jax), - (NumbaSolver(), QuadraticNumba, L1Numba)]) @pytest.mark.parametrize("sparse_X", [True, False]) -def test_solves(sparse_X, solver, datafit_cls, penalty_cls): +@pytest.mark.parametrize( + "solver, datafit_cls, penalty_cls", + [ + (CPUSolver(), BaseQuadratic, BaseL1), + (CupySolver(), QuadraticCuPy, L1CuPy), + (JaxSolver(use_auto_diff=True), QuadraticJax, L1Jax), + (JaxSolver(use_auto_diff=False), QuadraticJax, L1Jax), + (PytorchSolver(use_auto_diff=True), QuadraticPytorch, L1Pytorch), + (PytorchSolver(use_auto_diff=False), QuadraticPytorch, L1Pytorch), + (NumbaSolver(), QuadraticNumba, L1Numba) + ]) +def test_solves(solver, datafit_cls, penalty_cls, sparse_X): + if sparse_X and isinstance(solver, PytorchSolver): + pytest.xfail(reason="Sparse data still not yet supported") + random_state = 1265 n_samples, n_features = 100, 30 reg = 1e-2 From 32f1014c2c9595fb7e07ebecda5b6072b5377297 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Sun, 16 Apr 2023 12:01:57 +0200 Subject: [PATCH 50/52] pytorch solver sparse data --- skglm/gpu/solvers/pytorch_solver.py | 9 ++++++++- skglm/gpu/tests/test_fista.py | 3 --- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/skglm/gpu/solvers/pytorch_solver.py b/skglm/gpu/solvers/pytorch_solver.py index ed274ae59..d878f1761 100644 --- a/skglm/gpu/solvers/pytorch_solver.py +++ b/skglm/gpu/solvers/pytorch_solver.py @@ -26,7 +26,14 @@ def solve(self, X, y, datafit, penalty): # transfer data selected_device = torch.device("cuda") - X_gpu = torch.tensor(X, device=selected_device) + if X_is_sparse: + X_gpu = torch.sparse_csc_tensor( + X.indptr, X.indices, X.data, X.shape, + dtype=torch.float64, + device=selected_device + ) + else: + X_gpu = torch.tensor(X, device=selected_device) y_gpu = torch.tensor(y, device=selected_device) # init vars diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index 7d72438e4..f42b9bc32 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -29,9 +29,6 @@ (NumbaSolver(), QuadraticNumba, L1Numba) ]) def test_solves(solver, datafit_cls, penalty_cls, sparse_X): - if sparse_X and isinstance(solver, PytorchSolver): - pytest.xfail(reason="Sparse data still not yet supported") - random_state = 1265 n_samples, n_features = 100, 30 reg = 1e-2 From 0caa9f9c6955122982285a3fa365cdb888655f35 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Sun, 16 Apr 2023 13:34:23 +0200 Subject: [PATCH 51/52] set order between jax pytorch && xfail sparse and auto_diff false --- skglm/gpu/solvers/pytorch_solver.py | 8 ++++++++ skglm/gpu/tests/test_fista.py | 7 +++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/skglm/gpu/solvers/pytorch_solver.py b/skglm/gpu/solvers/pytorch_solver.py index d878f1761..341950396 100644 --- a/skglm/gpu/solvers/pytorch_solver.py +++ b/skglm/gpu/solvers/pytorch_solver.py @@ -17,6 +17,14 @@ def solve(self, X, y, datafit, penalty): n_samples, n_features = X.shape X_is_sparse = sparse.issparse(X) + if X_is_sparse and not self.use_auto_diff: + error_message = ( + "PyTorch doesn't support the operation `M.T @ vec`" + "for sparse matrices. Use `use_auto_diff=True`" + ) + + raise ValueError(error_message) + # compute step lipschitz = datafit.get_lipschitz_cst(X) if lipschitz == 0.: diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index f42b9bc32..ffe013e3b 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -22,13 +22,16 @@ [ (CPUSolver(), BaseQuadratic, BaseL1), (CupySolver(), QuadraticCuPy, L1CuPy), - (JaxSolver(use_auto_diff=True), QuadraticJax, L1Jax), - (JaxSolver(use_auto_diff=False), QuadraticJax, L1Jax), (PytorchSolver(use_auto_diff=True), QuadraticPytorch, L1Pytorch), (PytorchSolver(use_auto_diff=False), QuadraticPytorch, L1Pytorch), + (JaxSolver(use_auto_diff=True), QuadraticJax, L1Jax), + (JaxSolver(use_auto_diff=False), QuadraticJax, L1Jax), (NumbaSolver(), QuadraticNumba, L1Numba) ]) def test_solves(solver, datafit_cls, penalty_cls, sparse_X): + if (sparse_X and isinstance(solver, PytorchSolver) and not solver.use_auto_diff): + pytest.xfail(reason="PyTorch doesn't support `M.T @ vec` for sparse matrices") + random_state = 1265 n_samples, n_features = 100, 30 reg = 1e-2 From 545a27f40c2a530703cd6428df4994c667919708 Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Sun, 16 Apr 2023 13:41:17 +0200 Subject: [PATCH 52/52] test on obj value --- skglm/gpu/tests/test_fista.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/skglm/gpu/tests/test_fista.py b/skglm/gpu/tests/test_fista.py index ffe013e3b..b8d3afac0 100644 --- a/skglm/gpu/tests/test_fista.py +++ b/skglm/gpu/tests/test_fista.py @@ -5,6 +5,8 @@ import numpy as np from numpy.linalg import norm +from skglm.estimators import Lasso + from skglm.gpu.solvers import CPUSolver from skglm.gpu.solvers.base import BaseQuadratic, BaseL1 @@ -13,7 +15,7 @@ from skglm.gpu.solvers.numba_solver import NumbaSolver, QuadraticNumba, L1Numba from skglm.gpu.solvers.pytorch_solver import PytorchSolver, QuadraticPytorch, L1Pytorch -from skglm.gpu.utils.host_utils import eval_opt_crit +from skglm.gpu.utils.host_utils import eval_opt_crit, compute_obj @pytest.mark.parametrize("sparse_X", [True, False]) @@ -50,10 +52,15 @@ def test_solves(solver, datafit_cls, penalty_cls, sparse_X): lmbd = reg * lmbd_max w = solver.solve(X, y, datafit_cls(), penalty_cls(lmbd)) + estimator = Lasso(alpha=lmbd, fit_intercept=False).fit(X, y) stop_crit = eval_opt_crit(X, y, lmbd, w) np.testing.assert_allclose(stop_crit, 0., atol=1e-8) + np.testing.assert_allclose( + compute_obj(X, y, lmbd, estimator.coef_), + compute_obj(X, y, lmbd, w), + ) if __name__ == "__main__":