Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
286453b
human in the loop
chisingh Feb 28, 2025
8abe599
test agent
chisingh Mar 2, 2025
3c689d2
replay history
chisingh Mar 2, 2025
1235dd5
mock human input
chisingh Mar 2, 2025
5588b95
unit test
chisingh Mar 3, 2025
0bf872a
test clone
chisingh Mar 3, 2025
31492e0
CodeQL fix
chisingh Mar 17, 2025
155daad
init terminal
chisingh Mar 18, 2025
3227253
history steps
chisingh Mar 20, 2025
c3c8141
Merge remote-tracking branch 'origin' into hitl_agent
chisingh Mar 28, 2025
2eab98e
unit tests
chisingh Mar 28, 2025
f00000c
Merge remote-tracking branch 'origin' into hitl_agent
chisingh May 21, 2025
f955752
Merge remote-tracking branch 'origin' into hitl_agent
chisingh May 21, 2025
0e4b25e
updated tests
chisingh May 21, 2025
34f06c1
Merge branch 'main' into hitl_agent
chisingh May 21, 2025
640ea36
Merge branch 'main' into hitl_agent
chisingh May 22, 2025
eb8241b
isort, black
chisingh May 22, 2025
af0e4c7
fix test
chisingh May 22, 2025
a894c33
format fix
chisingh May 22, 2025
1d3763d
Merge remote-tracking branch 'origin' into hitl_agent
chisingh May 22, 2025
30361cb
lint fix
chisingh May 22, 2025
581904f
Merge branch 'main' into hitl_agent
MarcCote May 23, 2025
114634e
clone tools
chisingh May 28, 2025
cd951bd
improve error message when llm config doesn't exist
MarcCote May 28, 2025
0909ce7
Merge branch 'main' into hitl_agent
MarcCote May 28, 2025
97b233c
Fix README and improve output of init_llm_config
MarcCote May 28, 2025
e6ce726
install pandas
chisingh May 28, 2025
86daf77
black, isort
chisingh May 28, 2025
49d69f2
fix unit test
chisingh May 28, 2025
2f9059b
human first
chisingh May 29, 2025
e933a51
Merge branch 'main' of https://github.com/microsoft/debug-gym into hi…
chisingh May 29, 2025
3fb8e78
fix tests
chisingh May 29, 2025
5d573d3
Merge branch 'main' into hitl_agent
chisingh May 30, 2025
e10c31b
Mention to install mini_nightmare deps
MarcCote May 28, 2025
0ed066f
Support multiple -p args with run.py
MarcCote May 28, 2025
5981150
Revert mini_nightmare deps, should be installed in the docker container.
MarcCote May 28, 2025
f27caf1
Add option to list available agents and problems
MarcCote May 28, 2025
b59b20d
Typo in config file
MarcCote May 30, 2025
dd32232
Merge branch 'main' into hitl_agent
MarcCote Jun 4, 2025
e5abb91
Move guided agent in its own file.
MarcCote Jun 11, 2025
d1e23ba
Merge remote-tracking branch 'origin/main' into hitl_agent
MarcCote Jun 11, 2025
bc9dcfd
Fix logic and use human_response.tool
MarcCote Jun 12, 2025
5353750
Merge branch 'main' into hitl_agent
MarcCote Jul 15, 2025
f6ddcad
WIP
MarcCote Jul 15, 2025
6e8f179
Fix undefined info variable
MarcCote Jul 17, 2025
ad4b0f1
WIP
MarcCote Jul 18, 2025
7db2d58
Override build_prompt to take into account any given llm instead of…
MarcCote Jul 18, 2025
6caa919
WIP
MarcCote Jul 18, 2025
0c3fac9
Merge branch 'main' into hitl_agent
MarcCote Jul 18, 2025
6dccab8
WIP
MarcCote Jul 18, 2025
d4f9b1b
Install git in aider docker terminal
matheper Jul 22, 2025
616a59b
remove remaining ipdb
icwhite Jul 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions debug_gym/agents/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent
from debug_gym.agents.guided_agent import GuidedRewriteAgent
from debug_gym.agents.rewrite_agent import RewriteAgent
from debug_gym.agents.solution_agent import AgentSolution
184 changes: 184 additions & 0 deletions debug_gym/agents/guided_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import logging

from debug_gym.agents.base_agent import register_agent
from debug_gym.agents.history_tracker import build_history_prompt
from debug_gym.agents.rewrite_agent import RewriteAgent
from debug_gym.gym.entities import Event
from debug_gym.gym.tools.tool import ToolCall
from debug_gym.gym.tools.toolbox import Toolbox
from debug_gym.llms.base import LLM
from debug_gym.logger import DebugGymLogger


@register_agent
class GuidedRewriteAgent(RewriteAgent):
    """Rewrite agent in which a guide works alongside rewrite-only LLMs.

    At every step, each LLM listed in ``config["llms"]`` gets a chance to
    solve the task using only the ``rewrite`` tool and the history gathered
    so far. If none of them succeeds, the guide (``self.llm``, e.g. a human)
    takes one action in the environment and the loop repeats.
    """

    name: str = "guided_agent"

    def __init__(self, *args, **kwargs):
        """Build the rewrite-only LLMs and give the guide its own logger.

        All arguments are forwarded unchanged to the parent constructor.
        """
        super().__init__(*args, **kwargs)

        # Initialize the different LLM rewriters.
        self.llms = [
            LLM.instantiate(
                llm_name=llm_name,
                logger=DebugGymLogger(
                    name=llm_name,
                    level=logging.DEBUG,
                    log_dir=self.logger.log_file.parent,
                    icon="🤖",
                ),
            )
            for llm_name in self.config["llms"]
        ]

        # Create a logger for the main guide (e.g. a human).
        self.llm.logger = DebugGymLogger(
            name=self.config["llm_name"],
            level=logging.DEBUG,
            log_dir=self.logger.log_file.parent,
            icon="👤",
        )

    def build_prompt(self, info, llm):
        """Assemble system, history and question messages for ``llm``.

        Args:
            info: Environment info used to build the system prompt.
            llm: The LLM the history prompt is built for.

        Returns:
            The list of messages to send to ``llm``.
        """
        messages = []
        messages.extend(self.build_system_prompt(info))
        messages.extend(self.build_history_prompt(llm))
        messages.extend(self.build_question_prompt())
        return messages

    def build_history_prompt(self, llm):
        """Build the history messages for the given ``llm``.

        Unlike a single-LLM agent, the LLM is passed explicitly so that the
        same history can be rendered for any of the rewriters or the guide.
        """
        messages = build_history_prompt(
            self.history,
            llm,
            self.config["reset_prompt_history_after_rewrite"],
        )
        return messages

    def try_rewrite_and_rollback(self, llm, last_info):
        """Let ``llm`` attempt a rewrite, evaluate it, and undo it on failure.

        Args:
            llm: The LLM asked to produce a ``rewrite`` tool call.
            last_info: Latest environment info; supplies the prompt context
                and the list of available tools.

        Returns:
            The environment info returned by the ``eval`` step that follows
            the rewrite attempt.
        """
        prompt = self.build_prompt(last_info, llm)

        # Git commit the current state before trying to rewrite.
        self.env.terminal.run("git add . && git commit -m 'Before rewrite attempt'")

        # Remove all tools except the rewrite tool.
        tools = [tool for tool in last_info.tools if tool.name == "rewrite"]
        response = llm(prompt, tools)
        llm.logger.info(f"LLM response: {response.response}")
        llm.logger.info(f"LLM tool: {response.tool}")

        # Temporarily disable the REWRITE_SUCCESS event. The finally clause
        # guarantees the listeners are restored even if a step raises.
        self.env.event_hooks.mute(Event.REWRITE_SUCCESS)
        try:
            self.env.step(response.tool)
            llm_info = self.env.step(ToolCall(id="eval", name="eval", arguments={}))
        finally:
            self.env.event_hooks.unmute(Event.REWRITE_SUCCESS)

        llm.logger.info(f"LLM observation: {llm_info.eval_observation.observation}.")

        if not llm_info.done:
            # Rollback any changes made by the LLM if it hasn't solved the task yet.
            self.env.terminal.run("git reset --hard HEAD")

        return llm_info

    def run(self, task_name=None, debug=False):
        """Run the guided loop on ``task_name``.

        Args:
            task_name: Name of the task passed to ``env.reset``.
            debug: When true, drop into ``breakpoint()`` after each guide
                response and before it is applied to the environment.

        Returns:
            Truthy when the task was solved, either by the guide or by one
            of the rewrite-only LLMs; falsy otherwise.

        Raises:
            Exception: Any error raised during the run is re-raised after an
                "error" progress report has been emitted (when the
                environment was successfully reset).
        """
        step = 0
        max_steps = self.config["max_steps"]
        info = None
        # Stays None when no LLM attempt ever runs (no rewriters configured
        # or max_steps == 0); the final return must cope with that.
        llm_info = None
        try:
            self.history.reset()
            info = self.env.reset(options={"task_name": task_name})
            # initial state does not have prompt and response
            self.history.step(info, None)

            # First make sure git is setup correctly.
            self.env.terminal.run(
                "git init && git config user.name 'debug-gym' && git config user.email '<>'"
            )

            if info.done is True:
                self.logger.report_progress(
                    problem_id=task_name,
                    step=1,
                    total_steps=1,
                    score=info.score,
                    max_score=info.max_score,
                    status="resolved",
                )
                return True

            highscore = info.score

            for step in range(max_steps):
                self.logger.info(f"\n{'='*20} STEP {step+1} {'='*20}\n")
                highscore = max(highscore, info.score)
                # Guard the ratio so a zero max_score cannot abort the run.
                ratio = info.score / info.max_score if info.max_score else 0.0
                self.logger.info(
                    f"Step: {step} | Score: {info.score}/{info.max_score} ({ratio:.1%}) [Best: {highscore}]"
                )

                # NOTE(review): every LLM is tried even after one has solved
                # the task, so later attempts start from the solved state.
                # Confirm this is intended.
                solved = None
                for llm in self.llms:
                    llm_info = self.try_rewrite_and_rollback(llm, info)
                    if llm_info.done:
                        solved = llm_info
                        msg = f"[green] ✅ The rewrite-only agent with {llm.model_name} managed to solve the task with the current context. ✅ [/green]"
                        llm.logger.info(msg)
                    else:
                        msg = f"[red] ❌ The rewrite-only agent with {llm.model_name} failed to solve the task with the current context. ❌ [/red]"
                        llm.logger.info(msg)

                if solved is not None:
                    # Remember a successful attempt even if a later LLM failed.
                    llm_info = solved
                    break

                # If the LLM did not manage to solve the task, we continue with the guided approach.
                prompt = self.build_prompt(info, self.llm)
                guide_response = self.llm(prompt, info.tools)

                if debug:
                    breakpoint()

                # step the environment with the guide response
                info = self.env.step(guide_response.tool)
                # log the guide response
                self.history.step(info, guide_response)

                if info.done:
                    self.logger.info(
                        "You managed to provide the patch that solves the task before the LLM. Congrats!"
                    )
                    break

                # keep progress bar running until max_steps is reached
                self.logger.report_progress(
                    problem_id=task_name,
                    step=step + 1,
                    total_steps=max_steps + 1,
                    score=info.score,
                    max_score=info.max_score,
                    status="running",
                )

            # NOTE(review): the final "resolved"/"unresolved" progress report
            # is currently disabled for this agent; confirm whether that is
            # intentional before re-enabling it.

            return info.done or (llm_info is not None and llm_info.done)
        except Exception:
            # report any error that happens during the run
            if info is not None:
                self.logger.report_progress(
                    problem_id=task_name,
                    step=step + 1,
                    total_steps=step + 1,
                    score=info.score,
                    max_score=info.max_score,
                    status="error",
                )
            raise
10 changes: 8 additions & 2 deletions debug_gym/gym/envs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import logging

from debug_gym.gym.envs.aider import AiderBenchmarkEnv
from debug_gym.gym.envs.env import RepoEnv, TooledEnv
from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv
from debug_gym.gym.envs.swe_bench import SWEBenchEnv
from debug_gym.gym.envs.swe_smith import SWESmithEnv


def select_env(env_type: str = None) -> type[RepoEnv]:
Expand All @@ -12,8 +12,14 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
case "aider":
return AiderBenchmarkEnv
case "swebench":
from debug_gym.gym.envs.swe_bench import SWEBenchEnv

logging.getLogger("httpx").setLevel(logging.WARNING)
return SWEBenchEnv
case "swesmith":
from debug_gym.gym.envs.swe_smith import SWESmithEnv

logging.getLogger("httpx").setLevel(logging.WARNING)
return SWESmithEnv
case "mini_nightmare":
return MiniNightmareEnv
Expand Down
30 changes: 30 additions & 0 deletions debug_gym/gym/envs/aider.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,24 @@
from debug_gym.constants import DEBUG_GYM_CACHE_DIR
from debug_gym.gym.entities import EvalOutput
from debug_gym.gym.envs.env import RepoEnv
from debug_gym.gym.terminal import DockerTerminal, Terminal


class AiderBenchmarkEnv(RepoEnv):
REPO_URL = "https://github.com/exercism/python"
REPO_PATH = DEBUG_GYM_CACHE_DIR / "exercism"

def __init__(
    self,
    terminal: Terminal | None = None,
    **kwargs,
):
    """Create the environment, defaulting to a Docker-backed terminal.

    Args:
        terminal: Terminal used to run commands. When falsy, a
            ``DockerTerminal`` is created with the ``logger`` keyword
            argument (if one was given).
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If ``terminal`` is not a ``DockerTerminal``.
    """
    if not terminal:
        terminal = DockerTerminal(logger=kwargs.get("logger"))

    is_docker = isinstance(terminal, DockerTerminal)
    if not is_docker:
        raise ValueError("AiderBenchmarkEnv only supports DockerTerminal.")

    super().__init__(terminal=terminal, **kwargs)

@property
def instructions(self) -> str:
    """Return the ``"instructions"`` entry of the current sample."""
    sample = self.current_sample
    return sample["instructions"]
Expand All @@ -31,11 +43,29 @@ def eval(self, **kwargs) -> EvalOutput:
self.last_eval = EvalOutput(success, output)
return self.last_eval

def setup_terminal(self):
    """Set up a git repository through the environment's terminal.

    Runs, in order: repository initialisation, identity configuration, an
    initial commit of the top-level ``*.py`` files, and a second commit
    adding the ``.debugignore`` / ``.debugreadonly`` files.
    """
    self.logger.info(f"Configuring docker container: {self.terminal.container}")

    git_commands = (
        # Repository and committer identity.
        "git init",
        "git config user.name 'debug-gym'",
        "git config user.email '<>'",
        # Initial snapshot of the Python sources.
        "git add *.py",
        "git commit -am 'Init'",
        # debug-gym bookkeeping files get their own commit.
        "git add .debugignore",
        "git add .debugreadonly",
        "git commit -am 'Add debug-gym ignore and read-only files'",
    )
    for command in git_commands:
        self.terminal.run(command)

def reset(self, *, options: dict = None):
    """Select the requested task, prepare its workspace, and reset.

    Args:
        options: Reset options; must contain ``"task_name"``, which is
            used to look up the sample in ``self.dataset``.

    Returns:
        Whatever the parent ``reset`` returns for the same ``options``.

    Raises:
        KeyError: If ``"task_name"`` is missing from ``options``.
    """
    options = options or {}
    self.current_sample = self.dataset[options["task_name"]]
    directory = self.current_sample["base_directory"]
    self.setup_workspace(directory, entrypoint=self.entrypoint)
    # The leftover ipdb breakpoint was removed here: it halted every reset
    # and required a debugger package to be installed.
    self.setup_terminal()
    infos = super().reset(options=options)
    return infos

Expand Down
25 changes: 21 additions & 4 deletions debug_gym/gym/envs/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class EnvInfo:
class EventHooks:
def __init__(self):
    """Create empty active and muted listener lists for every ``Event``."""
    self.event_listeners = {}
    self.event_listeners_muted = {}
    for event in Event:
        # Each event gets its own fresh list in both registries.
        self.event_listeners[event] = []
        self.event_listeners_muted[event] = []

def subscribe(self, event: Event, tool: "Tool"):
if event not in self.event_listeners:
Expand All @@ -50,6 +51,20 @@ def subscribe(self, event: Event, tool: "Tool"):
def unsubscribe(self, event: Event, tool):
    """Remove ``tool`` from the active listeners of ``event``.

    Raises:
        KeyError: If ``event`` has no listener list.
        ValueError: If ``tool`` is not currently subscribed to ``event``.
    """
    listeners = self.event_listeners[event]
    listeners.remove(tool)

def mute(self, event: Event):
    """Mute all tools for the given event.

    The currently active listeners are moved to the muted registry so that
    they stop being notified until ``unmute`` is called. Calling ``mute``
    again before ``unmute`` keeps the listeners that were already muted
    instead of overwriting them with an empty list.

    Raises:
        ValueError: If ``event`` is not a known event type.
    """
    if event not in self.event_listeners_muted:
        raise ValueError(f"Unknown event type: {event}")
    # Append rather than assign: a repeated mute must not discard the
    # listeners saved by an earlier mute.
    self.event_listeners_muted[event].extend(self.event_listeners[event])
    self.event_listeners[event] = []

def unmute(self, event: Event):
    """Unmute all tools for the given event.

    Listeners previously moved aside by ``mute`` become active again.
    Listeners that are active at call time (for example ones subscribed
    while the event was muted, or all of them when ``unmute`` is called
    without a prior ``mute``) are kept rather than discarded.

    Raises:
        ValueError: If ``event`` is not a known event type.
    """
    if event not in self.event_listeners_muted:
        raise ValueError(f"Unknown event type: {event}")
    # Restore muted listeners first, then keep whatever is active now.
    self.event_listeners[event] = (
        self.event_listeners_muted[event] + self.event_listeners[event]
    )
    self.event_listeners_muted[event] = []

def notify(
self, environment, event: Event, source=None, **kwargs
) -> list[Observation]:
Expand Down Expand Up @@ -500,10 +515,12 @@ def current_breakpoints(self):

@property
def patch(self):
    """Return the output of ``git diff`` run through the terminal.

    Returns:
        The command output when it succeeds, or ``None`` (after logging an
        error that includes the output) when it fails.
    """
    success, output = self.terminal.run("git diff")
    if not success:
        # f-string so the actual command output appears in the log.
        self.logger.error(f"Failed to get git diff. {output}")
        return None

    return output

def apply_gold_patch(self):
raise NotImplementedError(
Expand Down
29 changes: 29 additions & 0 deletions debug_gym/gym/envs/mini_nightmare.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import os
import subprocess
from os.path import join as pjoin

import debug_gym.gym.utils as utils
from debug_gym.gym.entities import EvalOutput
from debug_gym.gym.envs.env import RepoEnv
from debug_gym.gym.terminal import DockerTerminal, Terminal


class MiniNightmareEnv(RepoEnv):
Expand All @@ -21,6 +23,17 @@ class MiniNightmareEnv(RepoEnv):
"tomorrow_date",
]

def __init__(
    self,
    terminal: Terminal | None = None,
    **kwargs,
):
    """Create the environment, defaulting to a Docker-backed terminal.

    Args:
        terminal: Terminal used to run commands. When falsy, a
            ``DockerTerminal`` is created with the ``logger`` keyword
            argument (if one was given).
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If ``terminal`` is not a ``DockerTerminal``.
    """
    if not terminal:
        terminal = DockerTerminal(logger=kwargs.get("logger"))

    is_docker = isinstance(terminal, DockerTerminal)
    if not is_docker:
        raise ValueError("MiniNightmareEnv only supports DockerTerminal.")

    super().__init__(terminal=terminal, **kwargs)

@property
def instructions(self) -> str:
    """Return the ``"instructions"`` entry of the current sample."""
    sample = self.current_sample
    return sample["instructions"]
Expand All @@ -41,11 +54,27 @@ def eval(self, **kwargs) -> EvalOutput:
self.last_eval = EvalOutput(success, output)
return self.last_eval

def setup_terminal(self):
    """Set up a git repository through the environment's terminal.

    Runs, in order: repository initialisation, identity configuration, an
    initial commit of the top-level ``*.py`` files, and a second commit
    adding the ``.debugignore`` / ``.debugreadonly`` files.
    """
    self.logger.info(f"Configuring {self.terminal.container}...")

    git_commands = (
        # Repository and committer identity.
        "git init",
        "git config user.name 'debug-gym'",
        "git config user.email '<>'",
        # Initial snapshot of the Python sources.
        "git add *.py",
        "git commit -am 'Init'",
        # debug-gym bookkeeping files get their own commit.
        "git add .debugignore",
        "git add .debugreadonly",
        "git commit -am 'Add debug-gym ignore and read-only files'",
    )
    for command in git_commands:
        self.terminal.run(command)

def reset(self, *, options: dict = None):
    """Select the requested task, prepare its workspace, and reset.

    Args:
        options: Reset options; must contain ``"task_name"``, which is
            used to look up the sample in ``self.dataset``.

    Returns:
        Whatever the parent ``reset`` returns for the same ``options``.
    """
    options = options or {}
    task_name = options["task_name"]
    self.current_sample = self.dataset[task_name]
    self.setup_workspace(
        self.current_sample["base_directory"], entrypoint=self.entrypoint
    )
    self.setup_terminal()

    return super().reset(options=options)

Expand Down
9 changes: 0 additions & 9 deletions debug_gym/gym/envs/swe_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,15 +147,6 @@ def setup_task(self, task_name):
self.test_spec, docker.from_env(), logger=None, nocache=False
)

@property
def patch(self):
    """Return the stdout of ``git diff`` executed in ``self.working_dir``."""
    completed = subprocess.run(
        ["git", "diff"],
        cwd=self.working_dir,
        text=True,
        capture_output=True,
    )
    return completed.stdout

def apply_gold_patch(self):
self.logger.info(f"Applying gold patch to {self.working_dir}.")
command = self.git_apply_cmd + f" <<'EOF'\n{self.gold_patch}\nEOF"
Expand Down
Loading