@@ -265,7 +265,7 @@ class _DynamicallySetParams:
     operators: dict[int, list[str]]
     maxdepth: int
     constraints: dict[str, int | tuple[int, ...]]
-    batch_size: int
+    batch_size: int | None
     update_verbosity: int
     progress: bool
     warmup_maxsize_by: float
@@ -623,12 +623,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         List of module names as strings to import in worker processes.
         For example, `["MyPackage", "OtherPackage"]` will run `using MyPackage, OtherPackage`
         in each worker process. Default is `None`.
-    batching : bool
+    batching : bool | "auto"
         Whether to compare population members on small batches during
         evolution. Still uses full dataset for comparing against hall
-        of fame. Default is `False`.
-    batch_size : int
-        The amount of data to use if doing batching. Default is `50`.
+        of fame. Default is `"auto"`, which enables batching when N >= 1000.
+    batch_size : int | None
+        The batch size to use when batching. If `None` (default), the size is
+        chosen automatically: 128 for N < 5000, 256 for N < 50000, or 512 otherwise.
     fast_cycle : bool
         Batch over population subsamples. This is a slightly different
         algorithm than regularized evolution, but does cycles 15%
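A minimal usage sketch of the new defaults (the settings shown are illustrative, not taken from this PR):

```python
from pysr import PySRRegressor

# New default: batching="auto" turns batching on once the dataset has
# at least 1000 rows, and batch_size=None picks the size automatically.
model = PySRRegressor()

# Explicit settings still behave as before:
model_fixed = PySRRegressor(batching=True, batch_size=50)  # old fixed size
model_off = PySRRegressor(batching=False)                  # never batch
```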
@@ -934,8 +935,8 @@ def __init__(
         heap_size_hint_in_bytes: int | None = None,
         worker_timeout: float | None = None,
         worker_imports: list[str] | None = None,
-        batching: bool = False,
-        batch_size: int = 50,
+        batching: bool | Literal["auto"] = "auto",
+        batch_size: int | None = None,
         fast_cycle: bool = False,
         turbo: bool = False,
         bumper: bool = False,
@@ -2133,9 +2134,15 @@ def _run(
             maxsize=int(self.maxsize),
             output_directory=_escape_filename(self.output_directory_),
             npopulations=int(self.populations),
-            batching=self.batching,
+            # Resolve "auto" batching based on dataset size
+            batching=(self.batching if self.batching != "auto" else len(X) >= 1000),
             batch_size=int(
-                min([runtime_params.batch_size, len(X)]) if self.batching else len(X)
+                _get_batch_size(len(X), runtime_params.batch_size)
+                if (
+                    self.batching is True
+                    or (self.batching == "auto" and len(X) >= 1000)
+                )
+                else len(X)
             ),
             mutation_weights=mutation_weights,
             tournament_selection_p=self.tournament_selection_p,
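For reference, the combined effect of the two expressions above can be read as the following standalone sketch (not the actual code path; `n` stands for `len(X)`):

```python
def resolve_batching(batching, batch_size, n):
    """Mirror of the logic in `_run`: returns (use_batching, effective_batch_size)."""
    use_batching = batching if batching != "auto" else n >= 1000
    if not use_batching:
        return False, n
    if batch_size is not None:
        return True, min(n, batch_size)  # explicit user value, capped at n
    # Automatic tiers, matching _get_batch_size below
    if n < 1000:
        return True, n
    if n < 5000:
        return True, 128
    if n < 50000:
        return True, 256
    return True, 512

assert resolve_batching("auto", None, 500) == (False, 500)
assert resolve_batching("auto", None, 2000) == (True, 128)
assert resolve_batching(True, 50, 100_000) == (True, 50)
```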
@@ -2389,15 +2396,12 @@ def fit(
             y_units,
         )

-        if X.shape[0] > 10000 and not self.batching:
+        if X.shape[0] > 50000:
             warnings.warn(
-                "Note: you are running with more than 10,000 datapoints. "
-                "You should consider turning on batching (https://ai.damtp.cam.ac.uk/pysr/options/#batching). "
-                "You should also reconsider if you need that many datapoints. "
-                "Unless you have a large amount of noise (in which case you "
-                "should smooth your dataset first), generally < 10,000 datapoints "
-                "is enough to find a functional form with symbolic regression. "
-                "More datapoints will lower the search speed."
+                "You are using a dataset with more than 50,000 datapoints. "
+                "Symbolic regression rarely benefits from this many points; consider "
+                "subsampling to 10,000 points or fewer. If you have high noise, "
+                "denoise the data first rather than using more points."
             )

         random_state = check_random_state(self.random_state)  # For np random
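The replacement warning recommends subsampling rather than batching for very large datasets; a sketch of what that could look like on the user side (the names `X`, `y`, and `model` are assumed):

```python
import numpy as np

# Randomly subsample a large dataset to 10,000 points before fitting.
rng = np.random.default_rng(seed=0)
idx = rng.choice(X.shape[0], size=10_000, replace=False)
model.fit(X[idx], y[idx])
```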
@@ -2980,8 +2984,22 @@ def _prepare_guesses_for_julia(guesses, nout) -> VectorValue | None:
     return jl_array(julia_guesses)


+def _get_batch_size(dataset_size: int, batch_size_param: int | None) -> int:
+    """Calculate the actual batch size to use."""
+    if batch_size_param is not None:
+        return min(dataset_size, batch_size_param)
+    elif dataset_size < 1000:
+        return dataset_size
+    elif dataset_size < 5000:
+        return 128
+    elif dataset_size < 50000:
+        return 256
+    else:
+        return 512
+
+
 def _mutate_parameter(param_name: str, param_value):
-    if param_name == "batch_size" and param_value < 1:
+    if param_name == "batch_size" and param_value is not None and param_value < 1:
         warnings.warn(
             "Given `batch_size` must be greater than or equal to one. "
             "`batch_size` has been increased to equal one."