|
| 1 | +from unittest.mock import MagicMock, patch |
| 2 | + |
| 3 | +from debug_gym.agents import GuidedRewriteAgent |
| 4 | +from debug_gym.llms import Human |
| 5 | +from debug_gym.llms.base import LLMResponse, TokenUsage |
| 6 | + |
| 7 | + |
@patch.object(
    Human,
    "__call__",
    return_value=LLMResponse(
        "Prompt",
        '{"id": "pdb-267437", "name": "pdb", "arguments": {"command": "c"}}',
        TokenUsage(2, 4),
    ),
)
def test_human_in_the_loop(human, agent_setup, build_env_info):
    """Run GuidedRewriteAgent with a patched Human and check env interactions.

    ``Human.__call__`` is patched to return a fixed pdb tool call; ``human``
    is the mock that replaces it. The test verifies that:

    * the main environment is stepped with the Human's response,
    * the Human's response is what ends up in the prompt/response log,
    * the environment is cloned, the clone is reset and replayed with the
      first action from the agent history,
    * the clone is stepped with the ``llm`` response, and the result of that
      step is not stored in the agent history.
    """
    # `call` is not among the names imported from unittest.mock at the top of
    # the file; import it here so the assert_has_calls check below does not
    # fail with a NameError.
    from unittest.mock import call

    agent, env, llm = next(agent_setup(GuidedRewriteAgent))

    # Fields shared by every EnvInfo built in this test.
    common_info = {
        "max_score": 10,
        "rewrite_counter": 0,
        "instructions": "Test instructions",
        "dir_tree": "Test dir tree",
        "current_breakpoints": "Test breakpoints",
        "step_observation": "Test last run obs",
    }
    env.reset.return_value = build_env_info(done=False, score=0, **common_info)
    env.step.return_value = build_env_info(done=False, score=10, **common_info)

    # Configure the clone through `return_value` rather than by calling
    # `env.clone()`: calling it here would record a call on the mock and make
    # the `env.clone.called` assertion below pass unconditionally.
    cloned_env = MagicMock()
    env.clone.return_value = cloned_env
    cloned_env.step.return_value = build_env_info(
        done=True, score=10, **common_info
    )

    llm_response = LLMResponse("Prompt", "Expected answer", TokenUsage(2, 4))
    llm.return_value = llm_response
    env.tools = {"pdb": MagicMock()}

    result = agent.run(task_name="test_task", debug=False)

    assert result is False

    # Read the patched return value directly instead of calling the mock
    # again, so the assertions do not add calls of their own.
    human_response = human.return_value

    # the main env was stepped, last with the Human's response
    assert env.step.called
    env.step.assert_called_with(human_response.response)
    assert env.step.return_value.done is False

    # the Human's response is what was logged as prompt/response pair
    _history, prompt_response_pairs = agent.history.get()
    assert [[], [human_response]] == prompt_response_pairs

    # the env was cloned by the agent and the clone was reset
    assert env.clone.called
    assert cloned_env.reset.called

    # the cloned env was stepped with the first action from the history
    cloned_env.step.assert_has_calls(
        [
            call(agent.history.get_all()[0].action),
        ]
    )

    # the cloned env was stepped, last with the llm's response
    assert cloned_env.step.called
    cloned_env.step.assert_called_with(llm_response.response)

    # the result of stepping the cloned env was not recorded in the history
    assert cloned_env.step.return_value not in agent.history.get_all()
0 commit comments