Fix logic and use human_response.tool

MarcCote · MarcCote · commit 7e3d654b7154 · 2025-06-12T08:58:47.000-04:00
diff --git a/debug_gym/agents/guided_agent.py b/debug_gym/agents/guided_agent.py
@@ -1,5 +1,7 @@
 import logging
 
+from termcolor import colored
+
 from debug_gym.agents.base_agent import register_agent
 from debug_gym.agents.rewrite_agent import RewriteAgent
 from debug_gym.llms.base import LLM
@@ -33,7 +35,10 @@ def try_rewrite(self, task_name):
         return info.done
 
     def run(self, task_name=None, debug=False):
-        self.llm.logger = DebugGymLogger(name="LLM", level=logging.ERROR)
+        self.logger.level = logging.DEBUG
+        self.llm.logger = DebugGymLogger(
+            name="LLM", level=logging.ERROR, log_dir=self.logger.log_file.parent
+        )
         self.human = LLM.instantiate(llm_name="human", logger=self.logger)
 
         self.history.reset()
@@ -55,10 +60,12 @@ def run(self, task_name=None, debug=False):
 
             llm_done = self.try_rewrite(task_name)
             if llm_done:
-                self.logger.info(
-                    f"*** The rewrite-only agent with {self.llm.model_name} managed to solve the task with the current context. ***"
-                )
+                msg = f"*** The rewrite-only agent with {self.llm.model_name} managed to solve the task with the current context. ***"
+                self.logger.info(colored(msg, "green"))
                 break
+            else:
+                msg = f"*** The rewrite-only agent with {self.llm.model_name} failed to solve the task with the current context. ***"
+                self.logger.info(colored(msg, "red"))
 
             # If the LLM did not manage to solve the task, we continue with the guided approach.
             prompt = self.build_prompt(info)
@@ -68,7 +75,7 @@ def run(self, task_name=None, debug=False):
                 breakpoint()
 
             # step the environment with the human response
-            info = self.env.step(human_response.response)
+            info = self.env.step(human_response.tool)
             # log the human response
             self.history.step(info, human_response)
 
diff --git a/debug_gym/agents/solution_agent.py b/debug_gym/agents/solution_agent.py
@@ -1,8 +1,4 @@
-import subprocess
-
 from debug_gym.agents.base_agent import BaseAgent, register_agent
-from debug_gym.gym.envs.swe_bench import SWEBenchEnv
-from debug_gym.gym.envs.swe_smith import SWESmithEnv
 from debug_gym.gym.tools.tool import ToolCall
 
 
diff --git a/debug_gym/gym/envs/__init__.py b/debug_gym/gym/envs/__init__.py
@@ -1,8 +1,8 @@
+import logging
+
 from debug_gym.gym.envs.aider import AiderBenchmarkEnv
 from debug_gym.gym.envs.env import RepoEnv, TooledEnv
 from debug_gym.gym.envs.mini_nightmare import MiniNightmareEnv
-from debug_gym.gym.envs.swe_bench import SWEBenchEnv
-from debug_gym.gym.envs.swe_smith import SWESmithEnv
 
 
 def select_env(env_type: str = None) -> type[RepoEnv]:
@@ -12,8 +12,14 @@ def select_env(env_type: str = None) -> type[RepoEnv]:
         case "aider":
             return AiderBenchmarkEnv
         case "swebench":
+            from debug_gym.gym.envs.swe_bench import SWEBenchEnv
+
+            logging.getLogger("httpx").setLevel(logging.WARNING)
             return SWEBenchEnv
         case "swesmith":
+            from debug_gym.gym.envs.swe_smith import SWESmithEnv
+
+            logging.getLogger("httpx").setLevel(logging.WARNING)
             return SWESmithEnv
         case "mini_nightmare":
             return MiniNightmareEnv
diff --git a/debug_gym/llms/base.py b/debug_gym/llms/base.py
@@ -1,4 +1,3 @@
-import logging
 import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -20,9 +19,6 @@
 from debug_gym.llms.utils import print_messages
 from debug_gym.logger import DebugGymLogger
 
-# Set logging level down to WARNING for endpoint queries.
-logging.getLogger("httpx").setLevel(logging.WARNING)
-
 
 def retry_on_rate_limit(
     func, is_rate_limit_error_func, multiplier=1, max_wait=40, max_attempts=100
diff --git a/debug_gym/llms/human.py b/debug_gym/llms/human.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import re
 import sys
 from typing import Any, Dict, List, Optional, Tuple
@@ -470,6 +471,12 @@ def __init__(
         if prompt_toolkit_available:
             self._history = InMemoryHistory()
 
+        # Warn if the logger is less verbose than INFO (level above logging.INFO), as this is a human interface.
+        if self.logger.level > logging.INFO:
+            self.logger.warning(
+                "Human Mode should have logger level set to at least INFO (using -v) for better interaction."
+            )
+
     def tokenize(self, text: str) -> list[str]:
         """Tokenizes a text by splitting it by spaces."""
         return text.split()
diff --git a/scripts/run.py b/scripts/run.py
@@ -1,14 +1,17 @@
 import json
+import logging  # Used below to set the "httpx" logger level down to WARNING for endpoint queries.
 import os
 import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from itertools import groupby
 from pathlib import Path
 
 from termcolor import colored
 from tqdm import tqdm
 
 from debug_gym.agents.base_agent import AGENT_REGISTRY, create_agent
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
 from debug_gym.agents.utils import load_config
 from debug_gym.gym.envs import select_env
 from debug_gym.gym.terminal import select_terminal
@@ -104,7 +107,7 @@ def main():
 
     # Figure out which problems to solve.
     problems = config.get("problems", ["custom"])
-    if type(problems) == str and "benchmark" in config:
+    if type(problems) is str and "benchmark" in config:
         env = create_env(config, logger=logger)
         if problems == "all":
             problems = sorted(env.dataset.keys())  # all tasks