Fix GuidedRewriteAgent logic and use human_response.tool

MarcCote · MarcCote · commit bc9dcfdad77d · 2025-06-12T09:16:25.000-04:00
diff --git a/debug_gym/agents/guided_agent.py b/debug_gym/agents/guided_agent.py
@@ -1,5 +1,7 @@
 import logging
 
+from termcolor import colored
+
 from debug_gym.agents.base_agent import register_agent
 from debug_gym.agents.rewrite_agent import RewriteAgent
 from debug_gym.llms.base import LLM
@@ -33,7 +35,10 @@ def try_rewrite(self, task_name):
         return info.done
 
     def run(self, task_name=None, debug=False):
-        self.llm.logger = DebugGymLogger(name="LLM", level=logging.ERROR)
+        self.logger.level = logging.DEBUG
+        self.llm.logger = DebugGymLogger(
+            name="LLM", level=logging.ERROR, log_dir=self.logger.log_file.parent
+        )
         self.human = LLM.instantiate(llm_name="human", logger=self.logger)
 
         self.history.reset()
@@ -55,10 +60,12 @@ def run(self, task_name=None, debug=False):
 
             llm_done = self.try_rewrite(task_name)
             if llm_done:
-                self.logger.info(
-                    f"*** The rewrite-only agent with {self.llm.model_name} managed to solve the task with the current context. ***"
-                )
+                msg = f"*** The rewrite-only agent with {self.llm.model_name} managed to solve the task with the current context. ***"
+                self.logger.info(colored(msg, "green"))
                 break
+            else:
+                msg = f"*** The rewrite-only agent with {self.llm.model_name} failed to solve the task with the current context. ***"
+                self.logger.info(colored(msg, "red"))
 
             # If the LLM did not manage to solve the task, we continue with the guided approach.
             prompt = self.build_prompt(info)
@@ -68,7 +75,7 @@ def run(self, task_name=None, debug=False):
                 breakpoint()
 
             # step the environment with the human response
-            info = self.env.step(human_response.response)
+            info = self.env.step(human_response.tool)
             # log the human response
             self.history.step(info, human_response)
 
diff --git a/debug_gym/agents/solution_agent.py b/debug_gym/agents/solution_agent.py
@@ -1,8 +1,4 @@
-import subprocess
-
 from debug_gym.agents.base_agent import BaseAgent, register_agent
-from debug_gym.gym.envs.swe_bench import SWEBenchEnv
-from debug_gym.gym.envs.swe_smith import SWESmithEnv
 from debug_gym.gym.tools.tool import ToolCall
 
 
diff --git a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py
@@ -1,8 +1,8 @@
+import logging
+
 from debug_gym.gym.envs.aider import AiderBenchmarkEnv
 from debug_gym.gym.envs.env import RepoEnv, TooledEnv
 from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv
-from debug_gym.gym.envs.swe_bench import SWEBenchEnv
-from debug_gym.gym.envs.swe_smith import SWESmithEnv
 
 
 def select_env(env_type: str = None) -> type[RepoEnv]:
@@ -12,8 +12,14 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
         case "aider":
             return AiderBenchmarkEnv
         case "swebench":
+            from debug_gym.gym.envs.swe_bench import SWEBenchEnv
+
+            logging.getLogger("httpx").setLevel(logging.WARNING)
             return SWEBenchEnv
         case "swesmith":
+            from debug_gym.gym.envs.swe_smith import SWESmithEnv
+
+            logging.getLogger("httpx").setLevel(logging.WARNING)
             return SWESmithEnv
         case "mini_nightmare":
             return MiniNightmareEnv
diff --git a/debug_gym/llms/base.py b/debug_gym/llms/base.py
@@ -1,4 +1,3 @@
-import logging
 import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -20,9 +19,6 @@
 from debug_gym.llms.utils import print_messages
 from debug_gym.logger import DebugGymLogger
 
-# Set logging level down to WARNING for endpoint queries.
-logging.getLogger("httpx").setLevel(logging.WARNING)
-
 
 def retry_on_rate_limit(
     func, is_rate_limit_error_func, multiplier=1, max_wait=40, max_attempts=100
diff --git a/debug_gym/llms/human.py b/debug_gym/llms/human.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import re
 import sys
 from typing import Any, Dict, List, Optional, Tuple
@@ -470,6 +471,12 @@ def __init__(
         if prompt_toolkit_available:
             self._history = InMemoryHistory()
 
+        # Warn if the logger is less verbose than INFO (level > logging.INFO), as this is a human interface.
+        if self.logger.level > logging.INFO:
+            self.logger.warning(
+                "Human Mode should have logger level set to at least INFO (using -v) for better interaction."
+            )
+
     def tokenize(self, text: str) -> list[str]:
         """Tokenizes a text by splitting it by spaces."""
         return text.split()
diff --git a/scripts/run.py b/scripts/run.py
@@ -1,14 +1,17 @@
 import json
+import logging  # Used below to set the httpx logger level to WARNING for endpoint queries.
 import os
 import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from itertools import groupby
 from pathlib import Path
 
 from termcolor import colored
 from tqdm import tqdm
 
 from debug_gym.agents.base_agent import AGENT_REGISTRY, create_agent
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
 from debug_gym.agents.utils import load_config
 from debug_gym.gym.envs import select_env
 from debug_gym.gym.terminal import select_terminal
@@ -104,7 +107,7 @@ def main():
 
     # Figure out which problems to solve.
     problems = config.get("problems", ["custom"])
-    if type(problems) == str and "benchmark" in config:
+    if type(problems) is str and "benchmark" in config:
         env = create_env(config, logger=logger)
         if problems == "all":
             problems = sorted(env.dataset.keys())  # all tasks
diff --git a/tests/agents/test_example_agent.py b/tests/agents/test_example_agent.py
@@ -95,80 +95,3 @@ def test_run_debug_5_agent(agent_setup, build_env_info):
     env.tools = {"pdb": MagicMock()}
     result = agent.run(task_name="test_task", debug=False)
     assert result
-
-
-@patch.object(
-    Human,
-    "__call__",
-    return_value=LLMResponse(
-        "Prompt",
-        '{"id": "pdb-267437", "name": "pdb", "arguments": {"command": "c"}}',
-        TokenUsage(2, 4),
-    ),
-)
-def test_human_in_the_loop(human, agent_setup, build_env_info):
-    agent, env, llm = next(agent_setup(DebugHumanInTheLoop))
-    env.reset.return_value = build_env_info(
-        done=False,
-        score=0,
-        max_score=10,
-        rewrite_counter=0,
-        instructions="Test instructions",
-        dir_tree="Test dir tree",
-        current_breakpoints="Test breakpoints",
-        step_observation="Test last run obs",
-    )
-    env.step.return_value = build_env_info(
-        done=False,
-        score=10,
-        max_score=10,
-        rewrite_counter=0,
-        instructions="Test instructions",
-        dir_tree="Test dir tree",
-        current_breakpoints="Test breakpoints",
-        step_observation="Test last run obs",
-    )
-
-    env.clone.return_value = MagicMock()
-    llm.return_value = LLMResponse("Prompt", "Expected answer", TokenUsage(2, 4))
-    env.tools = {"pdb": MagicMock()}
-
-    env.clone().step.return_value = build_env_info(
-        done=True,
-        score=10,
-        max_score=10,
-        rewrite_counter=0,
-        instructions="Test instructions",
-        dir_tree="Test dir tree",
-        current_breakpoints="Test breakpoints",
-        step_observation="Test last run obs",
-    )
-    result = agent.run(task_name="test_task", debug=False)
-
-    assert result is False
-    # test that llm actions were executed
-    assert env.step.called
-    env.step.assert_called_with(human().response)
-    assert env.step().done is False
-
-    # test that llm actions were logged
-    _history, _prompt_response_pairs = agent.history.get()
-    assert [[], [human()]] == _prompt_response_pairs
-
-    # test that env was cloned
-    assert env.clone.called
-    assert env.clone().reset.called
-
-    # assert that cloned env was called with history steps
-    env.clone().step.assert_has_calls(
-        [
-            call(agent.history.get_all()[0].action),
-        ]
-    )
-
-    # test that human action was executed
-    assert env.clone().step.called
-    env.clone().step.assert_called_with(llm().response)
-
-    # ensure that human action was not recorded in history
-    assert env.clone().step() not in agent.history.get_all()
diff --git a/tests/agents/test_guided_agent.py b/tests/agents/test_guided_agent.py
@@ -0,0 +1,83 @@
+from unittest.mock import MagicMock, call, patch
+
+from debug_gym.agents import GuidedRewriteAgent
+from debug_gym.llms import Human
+from debug_gym.llms.base import LLMResponse, TokenUsage
+
+
+@patch.object(
+    Human,
+    "__call__",
+    return_value=LLMResponse(
+        "Prompt",
+        '{"id": "pdb-267437", "name": "pdb", "arguments": {"command": "c"}}',
+        TokenUsage(2, 4),
+    ),
+)
+def test_human_in_the_loop(human, agent_setup, build_env_info):
+    """GuidedRewriteAgent.run steps the real env with the human tool call."""
+    agent, env, llm = next(agent_setup(GuidedRewriteAgent))
+    env.reset.return_value = build_env_info(
+        done=False,
+        score=0,
+        max_score=10,
+        rewrite_counter=0,
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+    env.step.return_value = build_env_info(
+        done=False,
+        score=10,
+        max_score=10,
+        rewrite_counter=0,
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+
+    env.clone.return_value = MagicMock()
+    llm.return_value = LLMResponse("Prompt", "Expected answer", TokenUsage(2, 4))
+    env.tools = {"pdb": MagicMock()}
+
+    env.clone().step.return_value = build_env_info(
+        done=True,
+        score=10,
+        max_score=10,
+        rewrite_counter=0,
+        instructions="Test instructions",
+        dir_tree="Test dir tree",
+        current_breakpoints="Test breakpoints",
+        step_observation="Test last run obs",
+    )
+    result = agent.run(task_name="test_task", debug=False)
+
+    assert result is False
+    # test that the human action was executed on the real env
+    assert env.step.called
+    env.step.assert_called_with(human().tool)
+    assert env.step().done is False
+
+    # test that the human response was logged in the history
+    _history, _prompt_response_pairs = agent.history.get()
+    assert [[], [human()]] == _prompt_response_pairs
+
+    # test that env was cloned
+    assert env.clone.called
+    assert env.clone().reset.called
+
+    # assert that cloned env was called with history steps
+    env.clone().step.assert_has_calls(
+        [
+            call(agent.history.get_all()[0].action),
+        ]
+    )
+
+    # test that the LLM action was executed on the cloned env
+    assert env.clone().step.called
+    env.clone().step.assert_called_with(llm().response)
+
+    # ensure that the cloned-env step result was not recorded in history
+    assert env.clone().step() not in agent.history.get_all()