Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
d89cb0d
new 32b script
hamishivi Nov 10, 2025
7494bad
new 32b script
hamishivi Nov 10, 2025
b35eb27
beaker eval freq not upstreaming
hamishivi Nov 10, 2025
a152aac
new var
hamishivi Nov 10, 2025
d221104
longer timeout on capturing cuda
hamishivi Nov 10, 2025
afaab18
longer timeout on capturing cuda
hamishivi Nov 10, 2025
a423960
update params
hamishivi Nov 11, 2025
f025002
reduce more
hamishivi Nov 11, 2025
812fb8b
no optim
hamishivi Nov 11, 2025
b441c9c
working script
hamishivi Nov 11, 2025
1667054
zpg inc
hamishivi Nov 11, 2025
09fda58
newer changes
hamishivi Nov 11, 2025
16e2758
higher zpg
hamishivi Nov 11, 2025
c210b1f
changes
hamishivi Nov 11, 2025
25d6856
fix
hamishivi Nov 11, 2025
fbdfa3a
zpg as arg
hamishivi Nov 11, 2025
53dde6e
debug
hamishivi Nov 11, 2025
1c4809e
update
hamishivi Nov 11, 2025
80e998a
update
hamishivi Nov 11, 2025
70aab0b
del tmp script
hamishivi Nov 11, 2025
9876bb8
oom test
hamishivi Nov 11, 2025
20d70d0
simpler solution
hamishivi Nov 11, 2025
5136b84
remove ref policy
hamishivi Nov 11, 2025
da78e98
update smoke test
hamishivi Nov 12, 2025
e3fd9a8
Merge branch 'main' into pad-out-32b
hamishivi Nov 12, 2025
5095af4
remove oom testing comment
hamishivi Nov 12, 2025
762bce3
beaker eval freq
hamishivi Nov 12, 2025
67ffff2
lint
hamishivi Nov 12, 2025
52aa926
fix up
hamishivi Nov 12, 2025
8b032f9
fix up
hamishivi Nov 12, 2025
861e677
zero std
hamishivi Nov 12, 2025
1268de9
trying something
hamishivi Nov 12, 2025
af816ab
trying something
hamishivi Nov 12, 2025
c1b3990
First commit for Slack integration.
finbarrtimbers Nov 13, 2025
eb5a719
Fixed import error
finbarrtimbers Nov 13, 2025
fae5cbd
Undid changes to script
finbarrtimbers Nov 13, 2025
7d7fadf
update error message
finbarrtimbers Nov 13, 2025
12b5b5c
Removed fake error
finbarrtimbers Nov 13, 2025
633d2bd
fix workspace
hamishivi Nov 13, 2025
3b1c24a
Merge branch 'finbarr/add-slack-grpo' into pad-out-32b
hamishivi Nov 13, 2025
e4c2408
fix
hamishivi Nov 13, 2025
33f4dec
fix
hamishivi Nov 13, 2025
744f3d1
fix
hamishivi Nov 13, 2025
6e3b6c8
emergency patch
hamishivi Nov 15, 2025
bf4c26f
Now we are on top of hamish's branch (#1197)
finbarrtimbers Nov 15, 2025
bc5b289
clean up regular ckpts
hamishivi Nov 15, 2025
30c420c
clean up regular ckpts
hamishivi Nov 15, 2025
d71695c
clean up regular ckpts
hamishivi Nov 15, 2025
5b11468
clean up regular ckpts
hamishivi Nov 15, 2025
42aa63c
quick fix
hamishivi Nov 15, 2025
e8b48ca
fixes to gs checkpoint logic
hamishivi Nov 15, 2025
4d48b7f
Revert "fixes to gs checkpoint logic"
hamishivi Nov 15, 2025
bc756b0
fixes to gs checkpoint logic
hamishivi Nov 15, 2025
b1ccc5d
fixes to gs checkpoint logic
hamishivi Nov 15, 2025
8d79ca8
fixes to gs checkpoint logic
hamishivi Nov 15, 2025
f7540af
fixes to gs checkpoint logic
hamishivi Nov 15, 2025
8d8232c
longer timeout
hamishivi Nov 15, 2025
dc7dc3d
clean up
hamishivi Nov 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 80 additions & 7 deletions mason.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@
import time

import beaker
import requests
from rich.console import Console
from rich.text import Text

from open_instruct.utils import GCP_CLUSTERS, INTERCONNECT_CLUSTERS, WEKA_CLUSTERS

console = Console()


Expand Down Expand Up @@ -87,11 +90,6 @@ def parse_env_var(env_var_str: str) -> dict[str, str]:
return {"name": name, "value": value}


WEKA_CLUSTERS = ["ai2/jupiter", "ai2/saturn", "ai2/titan", "ai2/neptune", "ai2/ceres", "ai2/triton", "ai2/rhea"]
GCP_CLUSTERS = ["ai2/augusta"]

INTERCONNECT_CLUSTERS = ["ai2/jupiter", "ai2/ceres", "ai2/titan", "ai2/augusta"]

# by default, we turn off vllm compile cache
# torch compile caching seems consistently broken, but the actual compiling isn't.
# Not sure why, for now we have disabled the caching (VLLM_DISABLE_COMPILE_CACHE=1).
Expand Down Expand Up @@ -292,6 +290,7 @@ def get_env_vars(
"AZURE_API_KEY",
"AZURE_API_BASE",
"ANTHROPIC_API_KEY",
"SLACK_WEBHOOK",
]
for useful_secret in useful_secrets:
if f"{whoami}_{useful_secret}" in beaker_secrets:
Expand Down Expand Up @@ -827,8 +826,82 @@ def main():
budget=args.budget,
retry=beaker.BeakerRetrySpec(allowed_task_retries=args.max_retries),
)
exp = beaker_client.experiment.create(spec=experiment_spec)
console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}")

# Increase timeout for HTTP requests and add retry logic
# The beaker library uses requests internally, and large experiment specs can take longer to process
max_retries = 3
retry_delay = 5 # Start with 5 seconds
timeout_seconds = 300 # Increase timeout to 300 seconds (5 minutes) for large experiment specs

# Monkey-patch requests.Session to intercept and increase timeout values
# The beaker library hardcodes a 5-second timeout, so we need to patch at the requests level
original_session_request = requests.Session.request
original_session_post = requests.Session.post

def patched_session_request(self, method, url, **kwargs):
    """Delegate to the original Session.request, enforcing a floor of
    `timeout_seconds` on the (read) timeout.

    Rules, mirroring the caller's intent:
    - no `timeout` kwarg at all -> default it to `timeout_seconds`
    - explicit `timeout=None`    -> left alone (None means wait forever)
    - (connect, read) tuple      -> only the read half is raised
    - scalar int/float           -> raised if below the floor
    """
    if "timeout" not in kwargs:
        kwargs["timeout"] = timeout_seconds
    else:
        timeout = kwargs["timeout"]
        if isinstance(timeout, tuple):
            # Two-element (connect, read) form: bump only the read timeout.
            if len(timeout) == 2 and timeout[1] < timeout_seconds:
                kwargs["timeout"] = (timeout[0], timeout_seconds)
        elif isinstance(timeout, (int, float)) and timeout < timeout_seconds:
            kwargs["timeout"] = timeout_seconds
    return original_session_request(self, method, url, **kwargs)

def patched_session_post(self, url, **kwargs):
    """Delegate to the original Session.post, enforcing a floor of
    `timeout_seconds` on the (read) timeout.

    Same policy as the request patch: missing timeout gets the floor,
    an explicit None is respected, tuples have only their read half
    raised, and small scalars are bumped up.
    """
    if "timeout" not in kwargs:
        kwargs["timeout"] = timeout_seconds
    else:
        timeout = kwargs["timeout"]
        if isinstance(timeout, tuple):
            # Two-element (connect, read) form: bump only the read timeout.
            if len(timeout) == 2 and timeout[1] < timeout_seconds:
                kwargs["timeout"] = (timeout[0], timeout_seconds)
        elif isinstance(timeout, (int, float)) and timeout < timeout_seconds:
            kwargs["timeout"] = timeout_seconds
    return original_session_post(self, url, **kwargs)

# Apply the patches
requests.Session.request = patched_session_request
requests.Session.post = patched_session_post
console.log(f"✅ Patched requests.Session to use minimum {timeout_seconds} second timeout")

# Also try to increase the timeout on the beaker client itself
try:
if hasattr(beaker_client, "_timeout"):
beaker_client._timeout = timeout_seconds
console.log(f"✅ Set beaker client timeout to {timeout_seconds} seconds")
except Exception as e:
console.log(f"⚠️ Could not modify beaker client timeout: {e}. Will rely on retries.")

# Retry logic with exponential backoff for timeout errors
for attempt in range(max_retries):
try:
exp = beaker_client.experiment.create(spec=experiment_spec)
console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}")
break
except (requests.exceptions.ReadTimeout, requests.exceptions.Timeout):
if attempt < max_retries - 1:
wait_time = retry_delay * (2**attempt) # Exponential backoff: 5s, 10s, 20s
console.log(
f"⚠️ Timeout occurred (attempt {attempt + 1}/{max_retries}). "
f"Retrying in {wait_time} seconds... "
f"Large experiment specs may take longer to process."
)
time.sleep(wait_time)
else:
console.log(
f"❌ Failed to create Beaker experiment after {max_retries} attempts due to timeout. "
f"The experiment spec may be too large or the Beaker API may be experiencing issues."
)
raise
except Exception:
# For other exceptions, don't retry
raise


if __name__ == "__main__":
Expand Down
Loading