Fix logic and use human_response.tool

MarcCote · MarcCote · commit fa9d13638051 · 2025-06-12T09:09:36.000-04:00
diff --git a/debug_gym/agents/guided_agent.py b/debug_gym/agents/guided_agent.py
@@ -1,5 +1,7 @@
 import logging
 
+from termcolor import colored
+
 from debug_gym.agents.base_agent import register_agent
 from debug_gym.agents.rewrite_agent import RewriteAgent
 from debug_gym.llms.base import LLM
@@ -12,6 +14,9 @@ class GuidedRewriteAgent(RewriteAgent):
 
     def try_rewrite(self, task_name):
         # make a copy of the env for the llm
+        from ipdb import set_trace
+
+        set_trace()
         cloned_env = self.env.clone()
 
         # Only keep the rewrite tool in the cloned env
@@ -33,7 +38,10 @@ def try_rewrite(self, task_name):
         return info.done
 
     def run(self, task_name=None, debug=False):
-        self.llm.logger = DebugGymLogger(name="LLM", level=logging.ERROR)
+        self.logger.level = logging.DEBUG
+        self.llm.logger = DebugGymLogger(
+            name="LLM", level=logging.ERROR, log_dir=self.logger.log_file.parent
+        )
         self.human = LLM.instantiate(llm_name="human", logger=self.logger)
 
         self.history.reset()
@@ -55,10 +63,12 @@ def run(self, task_name=None, debug=False):
 
             llm_done = self.try_rewrite(task_name)
             if llm_done:
-                self.logger.info(
-                    f"*** The rewrite-only agent with {self.llm.model_name} managed to solve the task with the current context. ***"
-                )
+                msg = f"*** The rewrite-only agent with {self.llm.model_name} managed to solve the task with the current context. ***"
+                self.logger.info(colored(msg, "green"))
                 break
+            else:
+                msg = f"*** The rewrite-only agent with {self.llm.model_name} failed to solve the task with the current context. ***"
+                self.logger.info(colored(msg, "red"))
 
             # If the LLM did not manage to solve the task, we continue with the guided approach.
             prompt = self.build_prompt(info)
@@ -68,7 +78,7 @@ def run(self, task_name=None, debug=False):
                 breakpoint()
 
             # step the environment with the human response
-            info = self.env.step(human_response.response)
+            info = self.env.step(human_response.tool)
             # log the human response
             self.history.step(info, human_response)
 
diff --git a/debug_gym/agents/solution_agent.py b/debug_gym/agents/solution_agent.py
@@ -1,8 +1,4 @@
-import subprocess
-
 from debug_gym.agents.base_agent import BaseAgent, register_agent
-from debug_gym.gym.envs.swe_bench import SWEBenchEnv
-from debug_gym.gym.envs.swe_smith import SWESmithEnv
 from debug_gym.gym.tools.tool import ToolCall
 
 
diff --git a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py
@@ -1,8 +1,8 @@
+import logging
+
 from debug_gym.gym.envs.aider import AiderBenchmarkEnv
 from debug_gym.gym.envs.env import RepoEnv, TooledEnv
 from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv
-from debug_gym.gym.envs.swe_bench import SWEBenchEnv
-from debug_gym.gym.envs.swe_smith import SWESmithEnv
 
 
 def select_env(env_type: str = None) -> type[RepoEnv]:
@@ -12,8 +12,14 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
         case "aider":
             return AiderBenchmarkEnv
         case "swebench":
+            from debug_gym.gym.envs.swe_bench import SWEBenchEnv
+
+            logging.getLogger("httpx").setLevel(logging.WARNING)
             return SWEBenchEnv
         case "swesmith":
+            from debug_gym.gym.envs.swe_smith import SWESmithEnv
+
+            logging.getLogger("httpx").setLevel(logging.WARNING)
             return SWESmithEnv
         case "mini_nightmare":
             return MiniNightmareEnv
diff --git a/debug_gym/llms/base.py b/debug_gym/llms/base.py
@@ -1,4 +1,3 @@
-import logging
 import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -20,9 +19,6 @@
 from debug_gym.llms.utils import print_messages
 from debug_gym.logger import DebugGymLogger
 
-# Set logging level down to WARNING for endpoint queries.
-logging.getLogger("httpx").setLevel(logging.WARNING)
-
 
 def retry_on_rate_limit(
     func, is_rate_limit_error_func, multiplier=1, max_wait=40, max_attempts=100
diff --git a/debug_gym/llms/human.py b/debug_gym/llms/human.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import re
 import sys
 from typing import Any, Dict, List, Optional, Tuple
@@ -470,6 +471,12 @@ def __init__(
         if prompt_toolkit_available:
             self._history = InMemoryHistory()
 
+        # Warn when the logger threshold is stricter than INFO (INFO messages would be hidden), as this is a human interface.
+        if self.logger.level > logging.INFO:
+            self.logger.warning(
+                "Human Mode should have logger level set to at least INFO (using -v) for better interaction."
+            )
+
     def tokenize(self, text: str) -> list[str]:
         """Tokenizes a text by splitting it by spaces."""
         return text.split()
diff --git a/scripts/run.py b/scripts/run.py
@@ -1,14 +1,17 @@
 import json
+import logging  # Used below to set the "httpx" logger level down to WARNING for endpoint queries.
 import os
 import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from itertools import groupby
 from pathlib import Path
 
 from termcolor import colored
 from tqdm import tqdm
 
 from debug_gym.agents.base_agent import AGENT_REGISTRY, create_agent
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
 from debug_gym.agents.utils import load_config
 from debug_gym.gym.envs import select_env
 from debug_gym.gym.terminal import select_terminal
@@ -104,7 +107,7 @@ def main():
 
     # Figure out which problems to solve.
     problems = config.get("problems", ["custom"])
-    if type(problems) == str and "benchmark" in config:
+    if type(problems) is str and "benchmark" in config:
         env = create_env(config, logger=logger)
         if problems == "all":
             problems = sorted(env.dataset.keys())  # all tasks
diff --git a/tests/agents/test_example_agent.py b/tests/agents/test_example_agent.py
@@ -0,0 +1,97 @@
+from unittest.mock import MagicMock
+
+from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent
+from debug_gym.agents.rewrite_agent import RewriteAgent
+from debug_gym.llms.base import LLMResponse, TokenUsage
+
+
+def test_build_question_prompt(agent_setup):
+    agent, _, _ = next(agent_setup(DebugAgent))
+    messages = agent.build_question_prompt()
+    assert len(messages) == 1
+    assert "continue your debugging" in messages[0]["content"]
+
+
+def test_build_prompt(agent_setup, build_env_info):
+    agent, _, _ = next(agent_setup(DebugAgent))
+    info = build_env_info(
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+    messages = agent.build_prompt(info)
+    assert len(messages) > 0
+
+
+def test_run(agent_setup, build_env_info):
+    agent, env, llm = next(agent_setup(DebugAgent))
+    env.reset.return_value = build_env_info(
+        done=False,
+        score=0,
+        max_score=10,
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+    env.step.return_value = build_env_info(
+        done=True,
+        score=10,
+        max_score=10,
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+    llm.return_value = LLMResponse("Prompt", "Expected answer", TokenUsage(2, 4))
+    result = agent.run(task_name="test_task", debug=False)
+    assert result
+
+
+def test_build_system_prompt_rewrite_agent(agent_setup, build_env_info):
+    agent, _, _ = next(agent_setup(RewriteAgent))
+    info = build_env_info(
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+    messages = agent.build_system_prompt(info)
+    assert len(messages) == 1
+    assert "Overall task" in messages[0]["content"]
+
+
+def test_build_question_prompt_rewrite_agent(agent_setup):
+    agent, _, _ = next(agent_setup(RewriteAgent))
+    messages = agent.build_question_prompt()
+    assert len(messages) == 1
+    assert "continue your debugging" in messages[0]["content"]
+
+
+def test_run_debug_5_agent(agent_setup, build_env_info):
+    agent, env, llm = next(agent_setup(Debug_5_Agent))
+    env.reset.return_value = build_env_info(
+        done=False,
+        score=0,
+        max_score=10,
+        rewrite_counter=0,
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+    env.step.return_value = build_env_info(
+        done=True,
+        score=10,
+        max_score=10,
+        rewrite_counter=0,
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+    llm.return_value = LLMResponse("Prompt", "Expected answer", TokenUsage(2, 4))
+    env.tools = {"pdb": MagicMock()}
+    result = agent.run(task_name="test_task", debug=False)
+    assert result
diff --git a/tests/agents/test_pdb_agent.py b/tests/agents/test_pdb_agent.py
@@ -1,4 +1,4 @@
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 
 from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent
 from debug_gym.agents.rewrite_agent import RewriteAgent

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-from unittest.mock import MagicMock`
	`1`	`+from unittest.mock import MagicMock, patch`
`2`	`2`
`3`	`3`	`from debug_gym.agents.debug_agent import Debug_5_Agent, DebugAgent`
`4`	`4`	`from debug_gym.agents.rewrite_agent import RewriteAgent`