Merge branch 'error-analysis' of github.com:ServiceNow/AgentLab into error-analysis

TLSDC · TLSDC · commit ec1395bdf19b · 2025-02-20T16:32:00.000-05:00
diff --git a/.github/workflows/code_format.yml b/.github/workflows/code_format.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: '3.10'
+          python-version: '3.11'
           cache: 'pip' # caching pip dependencies
 
       - name: Pip install
@@ -31,4 +31,4 @@ jobs:
         run: pip list
 
       - name: Code Formatting
-        run: black . --check
+        run: black . --check --diff
diff --git a/README.md b/README.md
@@ -65,7 +65,7 @@ AgentLab Features:
 
 ## 🛠️ Setup AgentLab
 
-AgentLab requires python 3.11 or higher.
+AgentLab requires Python 3.11 or 3.12.
 
 ```bash
 pip install agentlab
diff --git a/add_study_to_repro_journal.py b/add_study_to_repro_journal.py
@@ -0,0 +1,29 @@
+"""Append previously run studies to the reproducibility journal.
+
+Every entry of ``exp_paths`` is a study directory name under ``base_dir``;
+each one is loaded with ``Study.load`` and appended to the journal.
+"""
+
+from pathlib import Path
+
+from agentlab.experiments.study import Study
+
+base_dir = "/home/toolkit/ui_copilot_results"
+
+exp_paths = [
+    "2025-01-31_22-08-34_genericagent-o3-mini-2025-01-31-on-workarena-l1",
+    #  '2025-02-02_01-53-45_genericagent-openai-o1-mini-2024-09-12-on-workarena-l1',
+    "2025-02-02_01-55-04_genericagent-openai-o1-mini-2024-09-12-on-workarena-l1",
+]
+full_paths = [Path(base_dir) / exp_path for exp_path in exp_paths]
+
+
+def main() -> None:
+    """Load each study in ``full_paths`` and append it to the journal."""
+    for full_path in full_paths:
+        study = Study.load(full_path)
+        study.append_to_journal(strict_reproducibility=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv
@@ -64,4 +64,12 @@ ThibaultLSDC,GenericAgent-gpt-4o-mini_vision,visualwebarena,0.13.3,2024-12-02_02
 ThibaultLSDC,GenericAgent-gpt-4o_vision,visualwebarena,0.13.3,2024-12-02_07-17-28,7fb7eac8-4bbd-4ebe-be32-15901a7678f2,0.267,0.015,65,910/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
 ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta_vision,visualwebarena,0.13.3,2024-12-02_09-11-35,22f0611d-aeea-4ee9-a533-b45442b5e080,0.21,0.013,178,910/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
 ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,webarena,0.13.3,2024-12-02_23-18-38,fc5747bc-d998-4942-a0eb-e55a3ccc1cb3,0.184,0.014,213,811/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
-
+Leo Boisvert,GenericAgent-o3-mini-2025-01-31,workarena_l1,0.4.1,2025-01-31_22-08-33,a74cc00f-f743-43a1-9cab-59af8bffa3a2,0.482,0.028,3,330/330,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.3,1.44.0,v0.3.2,73baabee6d7ac37a5b8677c80baf83914a4f4dc4,"  M: src/agentlab/agents/generic_agent/__init__.py
+  M: src/agentlab/agents/generic_agent/agent_configs.py
+  M: src/agentlab/analyze/agent_xray.py
+  M: src/agentlab/llm/chat_api.py
+  M: src/agentlab/llm/llm_configs.py",0.13.3,1d2d7160e5b7ec9954ecb48988f71eb56288dd29,
+Leo Boisvert,GenericAgent-openai_o1-mini-2024-09-12,workarena_l1,0.4.1,2025-02-02_01-55-04,f3e1fcb8-5fc5-4115-9e00-27251508e2c7,0.518,0.028,5,330/330,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.3,1.44.0,v0.3.2,73baabee6d7ac37a5b8677c80baf83914a4f4dc4,"  M: src/agentlab/agents/generic_agent/__init__.py
+  M: src/agentlab/agents/generic_agent/agent_configs.py
+  M: src/agentlab/analyze/agent_xray.py
+  M: src/agentlab/llm/llm_configs.py",0.13.3,1d2d7160e5b7ec9954ecb48988f71eb56288dd29,
diff --git a/src/agentlab/__init__.py b/src/agentlab/__init__.py
@@ -1 +1 @@
-__version__ = "v0.3.2"
+__version__ = "v0.4.0"
diff --git a/src/agentlab/agents/generic_agent/__init__.py b/src/agentlab/agents/generic_agent/__init__.py
@@ -15,17 +15,28 @@
     RANDOM_SEARCH_AGENT,
     AGENT_4o,
     AGENT_4o_MINI,
+    AGENT_4o_MINI_VISION,
+    AGENT_CLAUDE_SONNET_35,
     AGENT_4o_VISION,
+    AGENT_CLAUDE_SONNET_35_VISION,
+    AGENT_o3_MINI,
+    AGENT_o1_MINI,
 )
 
+# Every name in __all__ must be bound in this module, or a star-import fails.
 __all__ = [
     "AGENT_3_5",
     "AGENT_4o",
     "AGENT_4o_MINI",
     "AGENT_4o_VISION",
+    "AGENT_o3_MINI",
+    "AGENT_o1_MINI",
     "AGENT_LLAMA3_70B",
     "AGENT_LLAMA31_70B",
     "AGENT_8B",
     "RANDOM_SEARCH_AGENT",
     "AGENT_CUSTOM",
+    "AGENT_CLAUDE_SONNET_35",
+    "AGENT_4o_MINI_VISION",
+    "AGENT_CLAUDE_SONNET_35_VISION",
 ]
diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py
@@ -260,7 +260,20 @@
     chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"],
     flags=FLAGS_GPT_4o,
 )
+AGENT_CLAUDE_SONNET_35 = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/anthropic/claude-3.5-sonnet:beta"],
+    flags=FLAGS_GPT_4o,
+)
 
+AGENT_o3_MINI = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/o3-mini-2025-01-31"],
+    flags=FLAGS_GPT_4o,
+)
+
+AGENT_o1_MINI = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/openai/o1-mini-2024-09-12"],
+    flags=FLAGS_GPT_4o,
+)
 # GPT-4o vision default config
 FLAGS_GPT_4o_VISION = FLAGS_GPT_4o.copy()
 FLAGS_GPT_4o_VISION.obs.use_screenshot = True
@@ -271,6 +284,16 @@
     flags=FLAGS_GPT_4o_VISION,
 )
 
+AGENT_4o_MINI_VISION = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"],
+    flags=FLAGS_GPT_4o_VISION,
+)
+
+AGENT_CLAUDE_SONNET_35_VISION = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/anthropic/claude-3.5-sonnet:beta"],
+    flags=FLAGS_GPT_4o_VISION,
+)
+
 
 DEFAULT_RS_FLAGS = GenericPromptFlags(
     flag_group="default_rs",
diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py
@@ -10,19 +10,19 @@
 
 from copy import deepcopy
 from dataclasses import asdict, dataclass
 from functools import partial
 from warnings import warn
 
 import bgym
 from browsergym.experiments.agent import Agent, AgentInfo
 
 from agentlab.agents import dynamic_prompting as dp
 from agentlab.agents.agent_args import AgentArgs
-from agentlab.llm.chat_api import BaseModelArgs, make_system_message, make_user_message
+from agentlab.llm.chat_api import BaseModelArgs
 from agentlab.llm.llm_utils import Discussion, ParseError, SystemMessage, retry
 from agentlab.llm.tracking import cost_tracker_decorator
 
 from .generic_agent_prompt import GenericPromptFlags, MainPrompt
 
 
 @dataclass
@@ -200,82 +200,3 @@ def _get_maxes(self):
             else 20  # dangerous to change the default value here?
         )
         return max_prompt_tokens, max_trunc_itr
-
-
-from functools import partial
-
-
-def get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict):
-    """
-    Get the action post-hoc for the agent.
-
-    This function is used to get the action after the agent has already been run.
-    Its goal is to recreate the prompt and the output of the agent a posteriori.
-    The purpose is to build datasets for training the agents.
-
-    Args:
-        agent (GenericAgent): The agent for which the action is being determined.
-        obs (dict): The observation dictionary to append to the agent's history.
-        ans_dict (dict): The answer dictionary containing the plan, step, memory, think, and action.
-
-    Returns:
-        Tuple[str, str]: The complete prompt used for the agent and the reconstructed output based on the answer dictionary.
-    """
-    system_prompt = dp.SystemPrompt().prompt
-
-    agent.obs_history.append(obs)
-
-    main_prompt = MainPrompt(
-        action_set=agent.action_set,
-        obs_history=agent.obs_history,
-        actions=agent.actions,
-        memories=agent.memories,
-        thoughts=agent.thoughts,
-        previous_plan=agent.plan,
-        step=agent.plan_step,
-        flags=agent.flags,
-    )
-
-    max_prompt_tokens, max_trunc_itr = agent._get_maxes()
-
-    fit_function = partial(
-        dp.fit_tokens,
-        max_prompt_tokens=max_prompt_tokens,
-        model_name=agent.chat_model_args.model_name,
-        max_iterations=max_trunc_itr,
-    )
-
-    instruction_prompt = fit_function(shrinkable=main_prompt)
-
-    if isinstance(instruction_prompt, list):
-        # NOTE: this is when we have images
-        instruction_prompt = instruction_prompt[0]["text"]
-
-    # TODO: make sure the bid is in the prompt
-
-    output = ""
-
-    # TODO: validate this
-    agent.plan = ans_dict.get("plan", agent.plan)
-    if agent.plan != "No plan yet":
-        output += f"\n<plan>\n{agent.plan}\n</plan>\n"
-
-    # TODO: is plan_step something that the agent's outputs?
-    agent.plan_step = ans_dict.get("step", agent.plan_step)
-
-    memory = ans_dict.get("memory", None)
-    agent.memories.append(memory)
-    if memory is not None:
-        output += f"\n<memory>\n{memory}\n</memory>\n"
-
-    thought = ans_dict.get("think", None)
-    agent.thoughts.append(thought)
-    if thought is not None:
-        output += f"\n<think>\n{thought}\n</think>\n"
-
-    action = ans_dict["action"]
-    agent.actions.append(action)
-    if action is not None:
-        output += f"\n<action>\n{action}\n</action>"
-
-    return system_prompt, instruction_prompt, output
diff --git a/src/agentlab/agents/generic_agent/reproducibility_agent.py b/src/agentlab/agents/generic_agent/reproducibility_agent.py
@@ -5,7 +5,7 @@
 This module contains the classes and functions to reproduce the results of a
 study. It is used to create a new study that will run the same experiments as
 the original study, but with a reproducibility agent that will mimic the same
-answers as the original agent. 
+answers as the original agent.
 
 Stats are collected to compare the original agent's answers with the new agent's
-answers. Load the this reproducibility study in agent-xray to compare the results.
+answers. Load this reproducibility study in agent-xray to compare the results.
diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py
@@ -143,6 +143,13 @@ def make_model(self):
                 max_new_tokens=self.max_new_tokens,
                 n_retry_server=self.n_retry_server,
             )
+        elif self.backend == "vllm":
+            return VLLMChatModel(
+                model_name=self.model_name,
+                temperature=self.temperature,
+                max_tokens=self.max_new_tokens,
+                n_retry_server=self.n_retry_server,
+            )
         else:
             raise ValueError(f"Backend {self.backend} is not supported")
 
@@ -429,3 +436,48 @@ def __init__(
 
         client = InferenceClient(model=model_url, token=token)
         self.llm = partial(client.text_generation, max_new_tokens=max_new_tokens)
+
+
+class VLLMChatModel(ChatModel):
+    """Chat model backed by an OpenAI-compatible endpoint (named for vLLM).
+
+    Builds a ``ChatModel`` that uses the ``OpenAI`` client class against
+    ``base_url`` and passes ``VLLM_API_KEY`` as the API-key environment
+    variable name. No pricing function is supplied (``pricing_func=None``).
+
+    Args:
+        model_name: Model identifier forwarded to ``ChatModel``.
+        api_key: API key forwarded to ``ChatModel``; presumably the
+            ``VLLM_API_KEY`` environment variable is used when this is
+            ``None`` (handled by the base class, not shown here).
+        temperature: Forwarded to ``ChatModel`` as ``temperature``.
+        max_tokens: Forwarded to ``ChatModel`` as ``max_tokens``.
+        n_retry_server: Forwarded to ``ChatModel`` as ``max_retry``.
+        min_retry_wait_time: Forwarded to ``ChatModel`` unchanged.
+        base_url: URL of the OpenAI-compatible server. Defaults to the
+            previously hard-coded local address, so existing callers are
+            unaffected.
+    """
+
+    def __init__(
+        self,
+        model_name,
+        api_key=None,
+        temperature=0.5,
+        max_tokens=100,
+        n_retry_server=4,
+        min_retry_wait_time=60,
+        base_url="http://0.0.0.0:8000/v1",
+    ):
+        super().__init__(
+            model_name=model_name,
+            api_key=api_key,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            max_retry=n_retry_server,
+            min_retry_wait_time=min_retry_wait_time,
+            api_key_env_var="VLLM_API_KEY",
+            client_class=OpenAI,
+            client_args={"base_url": base_url},
+            pricing_func=None,
+        )
diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py
@@ -16,7 +16,14 @@
     "test",
 ]
 
-CHAT_MODEL_ARGS_DICT = {  # type: dict[str, Union[AzureModelArgs, OpenAIModelArgs, SelfHostedModelArgs, OpenRouterModelArgs]]
+CHAT_MODEL_ARGS_DICT = {
+    "openai/o3-mini-2025-01-31": OpenAIModelArgs(
+        model_name="o3-mini-2025-01-31",
+        max_total_tokens=200_000,
+        max_input_tokens=200_000,
+        max_new_tokens=100_000,
+        vision_support=False,
+    ),
     "openai/gpt-4o-mini-2024-07-18": OpenAIModelArgs(
         model_name="gpt-4o-mini-2024-07-18",
         max_total_tokens=128_000,
@@ -56,6 +63,13 @@
         max_input_tokens=16_384,
         max_new_tokens=4096,
     ),
+    "openai/o1-mini": OpenAIModelArgs(
+        model_name="openai/o1-mini",
+        max_total_tokens=128_000,
+        max_input_tokens=128_000,
+        max_new_tokens=64_000,
+        temperature=1e-1,
+    ),
     "azure/gpt-35-turbo/gpt-35-turbo": AzureModelArgs(
         model_name="gpt-35-turbo",
         deployment_name="gpt-35-turbo",
@@ -113,6 +127,13 @@
         **default_oss_llms_args,
     ),
     # ---------------- OPENROUTER ----------------#
+    "openrouter/deepseek/deepseek-r1": OpenRouterModelArgs(
+        model_name="deepseek/deepseek-r1",
+        max_total_tokens=128_000,
+        max_input_tokens=100_000,
+        max_new_tokens=128_000,
+        temperature=1e-1,
+    ),
     "openrouter/meta-llama/llama-3.1-405b-instruct": OpenRouterModelArgs(
         model_name="meta-llama/llama-3.1-405b-instruct",
         max_total_tokens=128_000,

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "v0.3.2"`
	`1`	`+__version__ = "v0.4.0"`