MNT add asv benchmarks #137

Open · wants to merge 3 commits into main

100 changes: 100 additions & 0 deletions .github/workflows/asv.yml
@@ -0,0 +1,100 @@
name: ASV benchmarks

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]

jobs:
  run-benchmarks:
    name: Benchmark on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    permissions:
      contents: read
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest, ubuntu-24.04-arm]

    steps:
      - uses: actions/checkout@v5
        with:
          fetch-depth: 0

      - name: Set up main branch
        run: |
          git branch main origin/main

      - uses: prefix-dev/[email protected]
        with:
          environments: dev
          cache: true

      - name: Run benchmarks
        shell: bash
        run: |
          MACHINE=${{ matrix.os }} pixi run asv-build

      - name: Upload benchmark results
        uses: actions/upload-artifact@v4
        with:
          name: asv-results-${{ matrix.os }}
          path: asv_benchmarks/results

  publish-report:
    name: Build HTML report
    runs-on: ubuntu-latest
    needs: run-benchmarks
    permissions:
      contents: write

    steps:
      - uses: actions/checkout@v5

      - name: Prepare previous ASV results
        uses: actions/checkout@v5
        with:
          ref: gh-pages
          path: gh-pages

      - name: Copy previous results
        run: |
          mkdir -p asv_benchmarks/results
          cp -r gh-pages/results/* asv_benchmarks/results/ 2>/dev/null || true

      - name: Download all benchmark results
        uses: actions/download-artifact@v5
        with:
          pattern: asv-results-*

      - name: Merge new benchmark results
        run: |
          for d in asv-results-*; do
            [ -d "$d" ] || continue
            cp -r "$d"/* "asv_benchmarks/results/"
          done

      - uses: prefix-dev/[email protected]
        with:
          environments: dev
          cache: true

      - name: Generate HTML report
        run: |
          pixi run asv-publish

      - name: Copy results to publish directory
        run: |
          cp -r asv_benchmarks/results html/

      - name: Deploy to GitHub Pages
        uses: peaceiris/actions-gh-pages@v4
        if: github.event_name == 'push'
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./html
          keep_files: true
          user_name: 'github-actions[bot]'
          user_email: 'github-actions[bot]@users.noreply.github.com'
          commit_message: ${{ github.event.head_commit.message }}
2 changes: 1 addition & 1 deletion .github/workflows/emscripten.yml
@@ -7,7 +7,7 @@ jobs:
   build-wasm-wheel:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
       - name: Build WASM wheel
         uses: pypa/[email protected]
         env:
6 changes: 3 additions & 3 deletions .github/workflows/publish-pypi.yml
@@ -18,7 +18,7 @@ jobs:
       id-token: write
     steps:
       - name: Download artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v5
         with:
           path: dist/
           merge-multiple: true
@@ -27,9 +27,9 @@
       - name: Publish distribution to PyPI
         if: github.event.release.prerelease == false
         uses: pypa/gh-action-pypi-publish@release/v1

       - name: get wasm dist artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v5
         with:
           name: wasm_wheel
           path: wasm/
4 changes: 2 additions & 2 deletions .github/workflows/static.yml
@@ -8,8 +8,8 @@ jobs:
     runs-on: ubuntu-latest

     steps:
-      - uses: actions/checkout@v4
-      - uses: prefix-dev/setup-pixi@v0.8.14
+      - uses: actions/checkout@v5
+      - uses: prefix-dev/setup-pixi@v0.9.0
         with:
           environments: static
           cache: true
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -15,8 +15,8 @@ jobs:
     runs-on: ${{ matrix.os }}

     steps:
-      - uses: actions/checkout@v4
-      - uses: prefix-dev/setup-pixi@v0.8.14
+      - uses: actions/checkout@v5
+      - uses: prefix-dev/setup-pixi@v0.9.0
         with:
           environments: >-
             dev
6 changes: 3 additions & 3 deletions .github/workflows/wheel.yml
@@ -7,8 +7,8 @@ jobs:
   build-sdist:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-      - uses: prefix-dev/setup-pixi@v0.8.14
+      - uses: actions/checkout@v5
+      - uses: prefix-dev/setup-pixi@v0.9.0
         with:
           environments: dev
           cache: true
@@ -31,7 +31,7 @@ jobs:
         os: [ubuntu-latest, windows-latest, macos-latest, ubuntu-24.04-arm]
     runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
       - name: Build wheels
         uses: pypa/[email protected]
         env:
5 changes: 5 additions & 0 deletions asv_benchmarks/.gitignore
@@ -0,0 +1,5 @@
*__pycache__*
env/
html/
results/
benchmarks/cache/
18 changes: 18 additions & 0 deletions asv_benchmarks/asv.conf.json
@@ -0,0 +1,18 @@
{
    "version": 1,
    "project": "fastcan",
    "project_url": "https://github.com/scikit-learn-contrib/fastcan",
    "show_commit_url": "https://github.com/scikit-learn-contrib/fastcan/commit/",
    "repo": "..",
    "branches": ["main"],
    "environment_type": "conda",
    "conda_channels": ["conda-forge"],
    "build_command": ["python -m build --wheel -o {build_cache_dir} {build_dir}"],
    "install_command": ["python -mpip install {wheel_file}"],
    "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
    "pythons": ["3.13"],
    "matrix": {
        "scikit-learn": [""],
        "pandas": [""]
    }
}
1 change: 1 addition & 0 deletions asv_benchmarks/benchmarks/__init__.py
@@ -0,0 +1 @@
"""Benchmark suite for fastcan using ASV"""
53 changes: 53 additions & 0 deletions asv_benchmarks/benchmarks/common.py
@@ -0,0 +1,53 @@
import pickle
import timeit
from abc import ABC, abstractmethod
from pathlib import Path


def get_estimator_path(benchmark, params):
    """Get path of pickled fitted estimator"""
    path = Path(__file__).resolve().parent / "cache" / "estimators"

    filename = (
        benchmark.__class__.__name__
        + "_estimator_"
        + "_".join(list(map(str, params)))
        + ".pkl"
    )

    return path / filename


class Benchmark(ABC):
    """Abstract base class for all the benchmarks"""

    timer = timeit.default_timer  # wall time
    timeout = 500

    # save estimators
    current_path = Path(__file__).resolve().parent
    cache_path = current_path / "cache"
    cache_path.mkdir(exist_ok=True)
    (cache_path / "estimators").mkdir(exist_ok=True)

    def setup(self, *params):
        """Generate dataset and load the fitted estimator"""
        # This is run once per combination of parameters and per repeat so we
        # need to avoid doing expensive operations there.

        self.X, self.X_val, self.y, self.y_val = self.make_data(params)

        est_path = get_estimator_path(self, params)
        with est_path.open(mode="rb") as f:
            self.estimator = pickle.load(f)

    @abstractmethod
    def make_data(self, params):
        """Return the dataset for a combination of parameters"""
        # The datasets are cached using joblib.Memory so it's fast and can be
        # called for each repeat

    @property
    @abstractmethod
    def params(self):
        pass
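
The Benchmark base class above is meant to be subclassed once per estimator: a subclass defines params, builds its data in make_data, and pickles a fitted estimator in setup_cache so that setup only has to reload it. The following is a minimal hypothetical sketch of such a subclass; ExampleBenchmark, its parameter grid, and the time_predict method are illustrative and not part of this PR:

import itertools
import pickle

from sklearn.linear_model import LinearRegression

from .common import Benchmark, get_estimator_path
from .datasets import _synth_regression_dataset


class ExampleBenchmark(Benchmark):
    """Hypothetical subclass showing the Benchmark contract."""

    param_names = ["n_features"]
    params = ([50, 100],)

    def setup_cache(self):
        # Run once per class: fit and pickle an estimator for every
        # parameter combination so that setup() only reloads it.
        for params in itertools.product(*self.params):
            X, _, y, _ = self.make_data(params)
            estimator = LinearRegression().fit(X, y)
            with get_estimator_path(self, params).open(mode="wb") as f:
                pickle.dump(estimator, f)

    def make_data(self, params):
        (n_features,) = params
        return _synth_regression_dataset(n_samples=1000, n_features=n_features)

    def time_predict(self, *args):
        # setup() from Benchmark has already loaded self.estimator and self.X.
        self.estimator.predict(self.X)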
56 changes: 56 additions & 0 deletions asv_benchmarks/benchmarks/datasets.py
@@ -0,0 +1,56 @@
from pathlib import Path

import numpy as np
from joblib import Memory
from sklearn.datasets import (
    fetch_openml,
    load_digits,
    make_regression,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler, StandardScaler

# memory location for caching datasets
M = Memory(location=str(Path(__file__).resolve().parent / "cache"))


@M.cache
def _digits_dataset(n_samples=None, dtype=np.float32):
    X, y = load_digits(return_X_y=True)
    X = X.astype(dtype, copy=False)
    X = MaxAbsScaler().fit_transform(X)
    X = X[:n_samples]
    y = y[:n_samples]

    X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0)
    return X, X_val, y, y_val


@M.cache
def _synth_regression_dataset(n_samples=10000, n_features=200, dtype=np.float32):
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features // 10,
        noise=50,
        random_state=0,
    )
    X = X.astype(dtype, copy=False)
    X = StandardScaler().fit_transform(X)

    X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, random_state=0)
    return X, X_val, y, y_val


@M.cache
def _co2_dataset(dtype=np.float32):
    X, y = fetch_openml(data_id=41187, return_X_y=True, as_frame=False)
    X = X[:, [1, 3]]
    X = X.astype(dtype, copy=False)
    n_samples = len(y)
    n_test = int(n_samples * 0.1)

    mask_train = np.arange(n_samples) < (n_samples - n_test)
    X, X_val = X[mask_train], X[~mask_train]
    y_train, y_val = y[mask_train], y[~mask_train]
    return X, X_val, y_train, y_val
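
As a quick sanity check (not part of the diff), the cached loaders can be called directly. Assuming the working directory is asv_benchmarks/ so the benchmarks package is importable, the first call generates and caches the data under benchmarks/cache/ and later calls are served by joblib.Memory:

from benchmarks.datasets import _synth_regression_dataset

# 10% of the samples are held out as validation data by train_test_split.
X, X_val, y, y_val = _synth_regression_dataset(n_samples=1000, n_features=50)
print(X.shape, X_val.shape)  # (900, 50) (100, 50)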
85 changes: 85 additions & 0 deletions asv_benchmarks/benchmarks/fastcan.py
@@ -0,0 +1,85 @@
import itertools
import pickle

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression

from fastcan import FastCan

from .common import Benchmark, get_estimator_path
from .datasets import _digits_dataset, _synth_regression_dataset


class FastCanBenchmark(Benchmark):
    """
    Benchmarks for FastCan.
    """

    param_names = ["task", "alg"]
    params = (["classif", "reg"], ["h", "eta"])

    def setup_cache(self):
        """Pickle a fitted estimator for all combinations of parameters"""
        # This is run once per benchmark class.

        param_grid = list(itertools.product(*self.params))

        for params in param_grid:
            _, alg = params
            X, _, y, _ = self.make_data(params)

            if alg == "h":
                eta = False
            else:
                eta = True
            estimator = FastCan(
                n_features_to_select=20,
                eta=eta,
            )
            estimator.fit(X, y)

            est_path = get_estimator_path(self, params)
            with est_path.open(mode="wb") as f:
                pickle.dump(estimator, f)

    def make_data(self, params):
        task, _ = params
        if task == "classif":
            return _digits_dataset()
        return _synth_regression_dataset()

    def time_fit(self, *args):
        self.estimator.fit(self.X, self.y)

    def peakmem_fit(self, *args):
        self.estimator.fit(self.X, self.y)

    def track_train_score(self, *args):
        task, _ = args
        X_t = self.estimator.transform(self.X)
        if task == "classif":
            clf = LinearDiscriminantAnalysis()
            clf.fit(X_t, self.y)
            return float(clf.score(X_t, self.y))
        else:
            reg = LinearRegression()
            reg.fit(X_t, self.y)
            return float(reg.score(X_t, self.y))

    def track_test_score(self, *args):
        task, _ = args
        X_t = self.estimator.transform(self.X_val)
        if task == "classif":
            clf = LinearDiscriminantAnalysis()
            clf.fit(X_t, self.y_val)
            return float(clf.score(X_t, self.y_val))
        else:
            reg = LinearRegression()
            reg.fit(X_t, self.y_val)
            return float(reg.score(X_t, self.y_val))

    def time_transform(self, *args):
        self.estimator.transform(self.X)

    def peakmem_transform(self, *args):
        self.estimator.transform(self.X)
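
For debugging, one (task, alg) combination can be exercised outside of asv by driving the class hooks by hand. A rough sketch, assuming it is run from asv_benchmarks/ with fastcan installed; asv itself normally calls these methods:

from benchmarks.fastcan import FastCanBenchmark

bench = FastCanBenchmark()
bench.setup_cache()            # fit and pickle a FastCan estimator per (task, alg) pair
bench.setup("reg", "eta")      # load the synthetic regression data and the cached estimator
bench.time_fit("reg", "eta")   # the statement asv times
print(bench.track_train_score("reg", "eta"))  # R^2 of LinearRegression on the selected features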