From 17f88654c2189a681abae3c44c225024d85fa572 Mon Sep 17 00:00:00 2001
From: Lucia Quirke
Date: Mon, 10 Nov 2025 23:48:41 +0000
Subject: [PATCH 1/3] validate batch size before collecting

---
 bergson/collection.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/bergson/collection.py b/bergson/collection.py
index 7370f17..5328a22 100644
--- a/bergson/collection.py
+++ b/bergson/collection.py
@@ -77,6 +77,8 @@ def callback(name: str, g: torch.Tensor, indices: list[int]):
         attention_cfgs=attention_cfgs,
     )
 
+    validate_batch_size(model, token_batch_size, collector)
+
     # Allocate space ahead of time for the gradients
     grad_sizes = {name: math.prod(s) for name, s in collector.shapes().items()}
 
@@ -252,3 +254,21 @@
             preconditioners_eigen[name] = (eigval, eigvec)
     if rank == 0:
         processor.preconditioners_eigen = preconditioners_eigen
+
+
+def validate_batch_size(
+    model: PreTrainedModel,
+    token_batch_size: int | None,
+    collector: GradientCollector,
+):
+    """Validate that the specified token batch size fits on device."""
+    if token_batch_size is None:
+        return
+
+    random_tokens = torch.randint(
+        0, 10, (1, token_batch_size), device=model.device, dtype=torch.long
+    )
+    with collector:
+        loss = model(random_tokens).logits[0, 0, 0].float()
+        loss.backward()
+    model.zero_grad()

From de789d514aeff2106d576c916eceb95ed1e5a871 Mon Sep 17 00:00:00 2001
From: Lucia Quirke
Date: Thu, 13 Nov 2025 00:52:00 +0000
Subject: [PATCH 2/3] Fix error from rebase

---
 bergson/collection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bergson/collection.py b/bergson/collection.py
index 5328a22..a0b0036 100644
--- a/bergson/collection.py
+++ b/bergson/collection.py
@@ -77,7 +77,7 @@ def callback(name: str, g: torch.Tensor, indices: list[int]):
         attention_cfgs=attention_cfgs,
     )
 
-    validate_batch_size(model, token_batch_size, collector)
+    validate_batch_size(model, cfg.token_batch_size, collector)
 
     # Allocate space ahead of time for the gradients
     grad_sizes = {name: math.prod(s) for name, s in collector.shapes().items()}

From 6ec77b5d26470c784e8f36ec2003570dbbfa87e4 Mon Sep 17 00:00:00 2001
From: Lucia Quirke
Date: Thu, 13 Nov 2025 00:54:48 +0000
Subject: [PATCH 3/3] Improve batch size validation message

---
 bergson/collection.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/bergson/collection.py b/bergson/collection.py
index a0b0036..bb7050e 100644
--- a/bergson/collection.py
+++ b/bergson/collection.py
@@ -268,7 +268,13 @@
     random_tokens = torch.randint(
        0, 10, (1, token_batch_size), device=model.device, dtype=torch.long
    )
-    with collector:
-        loss = model(random_tokens).logits[0, 0, 0].float()
-        loss.backward()
-    model.zero_grad()
+    try:
+        with collector:
+            loss = model(random_tokens).logits[0, 0, 0].float()
+            loss.backward()
+        model.zero_grad()
+    except Exception as e:
+        raise ValueError(
+            f"Token batch size {token_batch_size} is too large for the device. "
+            f"Try reducing the batch size or use --fsdp to shard the model."
+        ) from e