Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 40 additions & 20 deletions pysr/sr.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ class _DynamicallySetParams:
operators: dict[int, list[str]]
maxdepth: int
constraints: dict[str, int | tuple[int, ...]]
batch_size: int
batch_size: int | None
update_verbosity: int
progress: bool
warmup_maxsize_by: float
Expand Down Expand Up @@ -623,12 +623,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
List of module names as strings to import in worker processes.
For example, `["MyPackage", "OtherPackage"]` will run `using MyPackage, OtherPackage`
in each worker process. Default is `None`.
batching : bool
batching : bool | "auto"
Whether to compare population members on small batches during
evolution. Still uses full dataset for comparing against hall
of fame. Default is `False`.
batch_size : int
The amount of data to use if doing batching. Default is `50`.
of fame. "auto" enables batching for N≥1000. Default is `"auto"`.
batch_size : int | None
The batch size to use if batching. If None, uses
128 for N<5000, 256 for N<50000, or 512 for N≥50000.
Default is `None`.
fast_cycle : bool
Batch over population subsamples. This is a slightly different
algorithm than regularized evolution, but does cycles 15%
Expand Down Expand Up @@ -934,8 +936,8 @@ def __init__(
heap_size_hint_in_bytes: int | None = None,
worker_timeout: float | None = None,
worker_imports: list[str] | None = None,
batching: bool = False,
batch_size: int = 50,
batching: bool | Literal["auto"] = "auto",
batch_size: int | None = None,
fast_cycle: bool = False,
turbo: bool = False,
bumper: bool = False,
Expand Down Expand Up @@ -1559,7 +1561,7 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams:
operators={2: ["+", "*", "-", "/"]},
maxdepth=self.maxsize,
constraints={},
batch_size=1,
batch_size=None,
update_verbosity=int(self.verbosity),
progress=self.progress,
warmup_maxsize_by=0.0,
Expand All @@ -1580,8 +1582,9 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams:

for param_name in map(lambda x: x.name, fields(_DynamicallySetParams)):
user_param_value = getattr(self, param_name)
if user_param_value is None:
if user_param_value is None and param_name != "batch_size":
# Leave as the default in DynamicallySetParams
# (except for batch_size, which we want to keep as None)
...
else:
# If user has specified it, we will override the default.
Expand Down Expand Up @@ -2133,9 +2136,15 @@ def _run(
maxsize=int(self.maxsize),
output_directory=_escape_filename(self.output_directory_),
npopulations=int(self.populations),
batching=self.batching,
# Determine actual batching based on "auto" mode
batching=(self.batching if self.batching != "auto" else len(X) >= 1000),
batch_size=int(
min([runtime_params.batch_size, len(X)]) if self.batching else len(X)
_get_batch_size(len(X), runtime_params.batch_size)
if (
self.batching == True
or (self.batching == "auto" and len(X) >= 1000)
)
else len(X)
),
mutation_weights=mutation_weights,
tournament_selection_p=self.tournament_selection_p,
Expand Down Expand Up @@ -2389,15 +2398,12 @@ def fit(
y_units,
)

if X.shape[0] > 10000 and not self.batching:
if X.shape[0] > 50000:
warnings.warn(
"Note: you are running with more than 10,000 datapoints. "
"You should consider turning on batching (https://ai.damtp.cam.ac.uk/pysr/options/#batching). "
"You should also reconsider if you need that many datapoints. "
"Unless you have a large amount of noise (in which case you "
"should smooth your dataset first), generally < 10,000 datapoints "
"is enough to find a functional form with symbolic regression. "
"More datapoints will lower the search speed."
"You are using a dataset with more than 50,000 datapoints. "
"Symbolic regression rarely benefits from this many points - consider "
"subsampling to 10,000 points or fewer. If you have high noise, "
"denoise the data first rather than using more points."
)

random_state = check_random_state(self.random_state) # For np random
Expand Down Expand Up @@ -2980,8 +2986,22 @@ def _prepare_guesses_for_julia(guesses, nout) -> VectorValue | None:
return jl_array(julia_guesses)


def _get_batch_size(dataset_size: int, batch_size_param: int | None) -> int:
"""Calculate the actual batch size to use."""
if batch_size_param is not None:
return min(dataset_size, batch_size_param)
elif dataset_size < 1000:
return dataset_size
elif dataset_size < 5000:
return 128
elif dataset_size < 50000:
return 256
else:
return 512


def _mutate_parameter(param_name: str, param_value):
if param_name == "batch_size" and param_value < 1:
if param_name == "batch_size" and param_value is not None and param_value < 1:
warnings.warn(
"Given `batch_size` must be greater than or equal to one. "
"`batch_size` has been increased to equal one."
Expand Down
59 changes: 56 additions & 3 deletions pysr/test/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1257,6 +1257,59 @@ def test_load_all_packages(self):
load_all_packages()
self.assertTrue(jl.seval("ClusterManagers isa Module"))

def test_get_batch_size(self):
    """Test the _get_batch_size function."""
    from pysr.sr import _get_batch_size

    # Auto mode (batch_size_param=None): expected batch size keyed by
    # dataset size, exercising both sides of each threshold.
    auto_cases = {
        500: 500,
        999: 999,
        1000: 128,
        1500: 128,
        4999: 128,
        5000: 256,
        10000: 256,
        49999: 256,
        50000: 512,
        100000: 512,
    }
    for dataset_size, expected in auto_cases.items():
        self.assertEqual(_get_batch_size(dataset_size, None), expected)

    # Explicit batch_size: honored as-is, but capped at the dataset size.
    self.assertEqual(_get_batch_size(1000, 64), 64)
    self.assertEqual(_get_batch_size(1000, 2000), 1000)
    self.assertEqual(_get_batch_size(50, 100), 50)

def test_batching_auto(self):
    """Test that batching='auto' works correctly."""
    # "auto" should be the default value of the `batching` parameter.
    self.assertEqual(PySRRegressor().batching, "auto")

    # Fitting must succeed in auto mode both below the batching
    # activation threshold (n=100) and at it (n=1000).
    for n_rows in (100, 1000):
        regressor = PySRRegressor(batching="auto", niterations=0)
        regressor.fit(np.random.randn(n_rows, 2), np.random.randn(n_rows))

def test_batch_size_negative_warning(self):
    """Test that batch_size < 1 gives a warning for integers only."""
    features = np.random.randn(10, 2)
    targets = np.random.randn(10)

    # batch_size=0 must warn; promote warnings to errors so the warning
    # surfaces as a catchable exception mentioning "batch_size".
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        with self.assertRaises(UserWarning) as context:
            PySRRegressor(batch_size=0, niterations=0).fit(features, targets)
        self.assertIn("batch_size", str(context.exception))

    # batch_size=None (auto) must NOT warn, even with warnings-as-errors.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        PySRRegressor(batch_size=None, niterations=0).fit(features, targets)


class TestHelpMessages(unittest.TestCase):
"""Test user help messages."""
Expand Down Expand Up @@ -1301,13 +1354,13 @@ def test_power_law_warning(self):
def test_size_warning(self):
"""Ensure that a warning is given for a large input size."""
model = PySRRegressor()
X = np.random.randn(10001, 2)
y = np.random.randn(10001)
X = np.random.randn(50001, 2)
y = np.random.randn(50001)
with warnings.catch_warnings():
warnings.simplefilter("error")
with self.assertRaises(Exception) as context:
model.fit(X, y)
self.assertIn("more than 10,000", str(context.exception))
self.assertIn("more than 50,000", str(context.exception))

def test_deterministic_warnings(self):
"""Ensure that warnings are given for determinism"""
Expand Down
Loading