Skip to content

Commit ec1395b

Browse files
committed
Merge branch 'error-analysis' of github.com:ServiceNow/AgentLab into error-analysis
2 parents 0972130 + c7b1c5a commit ec1395b

File tree

11 files changed

+119
-88
lines changed

11 files changed

+119
-88
lines changed

.github/workflows/code_format.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
- name: Set up Python
2222
uses: actions/setup-python@v5
2323
with:
24-
python-version: '3.10'
24+
python-version: '3.11'
2525
cache: 'pip' # caching pip dependencies
2626

2727
- name: Pip install
@@ -31,4 +31,4 @@ jobs:
3131
run: pip list
3232

3333
- name: Code Formatting
34-
run: black . --check
34+
run: black . --check --diff

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ AgentLab Features:
6565

6666
## 🛠️ Setup AgentLab
6767

68-
AgentLab requires python 3.11 or higher.
68+
AgentLab requires python 3.11 or 3.12.
6969

7070
```bash
7171
pip install agentlab

add_study_to_repro_journal.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import os
2+
from pathlib import Path
3+
from agentlab.experiments.study import Study
4+
5+
6+
base_dir = "/home/toolkit/ui_copilot_results"
7+
8+
exp_paths = [
9+
"2025-01-31_22-08-34_genericagent-o3-mini-2025-01-31-on-workarena-l1",
10+
# '2025-02-02_01-53-45_genericagent-openai-o1-mini-2024-09-12-on-workarena-l1',
11+
"2025-02-02_01-55-04_genericagent-openai-o1-mini-2024-09-12-on-workarena-l1",
12+
]
13+
full_paths = [os.path.join(base_dir, exp_path) for exp_path in exp_paths]
14+
15+
for full_path in full_paths:
16+
study = Study.load(Path(full_path))
17+
18+
study.append_to_journal(strict_reproducibility=False)

reproducibility_journal.csv

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,4 +64,12 @@ ThibaultLSDC,GenericAgent-gpt-4o-mini_vision,visualwebarena,0.13.3,2024-12-02_02
6464
ThibaultLSDC,GenericAgent-gpt-4o_vision,visualwebarena,0.13.3,2024-12-02_07-17-28,7fb7eac8-4bbd-4ebe-be32-15901a7678f2,0.267,0.015,65,910/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
6565
ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta_vision,visualwebarena,0.13.3,2024-12-02_09-11-35,22f0611d-aeea-4ee9-a533-b45442b5e080,0.21,0.013,178,910/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
6666
ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,webarena,0.13.3,2024-12-02_23-18-38,fc5747bc-d998-4942-a0eb-e55a3ccc1cb3,0.184,0.014,213,811/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,
67-
67+
Leo Boisvert,GenericAgent-o3-mini-2025-01-31,workarena_l1,0.4.1,2025-01-31_22-08-33,a74cc00f-f743-43a1-9cab-59af8bffa3a2,0.482,0.028,3,330/330,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.3,1.44.0,v0.3.2,73baabee6d7ac37a5b8677c80baf83914a4f4dc4," M: src/agentlab/agents/generic_agent/__init__.py
68+
M: src/agentlab/agents/generic_agent/agent_configs.py
69+
M: src/agentlab/analyze/agent_xray.py
70+
M: src/agentlab/llm/chat_api.py
71+
M: src/agentlab/llm/llm_configs.py",0.13.3,1d2d7160e5b7ec9954ecb48988f71eb56288dd29,"
72+
Leo Boisvert,GenericAgent-openai_o1-mini-2024-09-12,workarena_l1,0.4.1,2025-02-02_01-55-04,f3e1fcb8-5fc5-4115-9e00-27251508e2c7,0.518,0.028,5,330/330,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.3,1.44.0,v0.3.2,73baabee6d7ac37a5b8677c80baf83914a4f4dc4," M: src/agentlab/agents/generic_agent/__init__.py
73+
M: src/agentlab/agents/generic_agent/agent_configs.py
74+
M: src/agentlab/analyze/agent_xray.py
75+
M: src/agentlab/llm/llm_configs.py",0.13.3,1d2d7160e5b7ec9954ecb48988f71eb56288dd29,"

src/agentlab/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "v0.3.2"
1+
__version__ = "v0.4.0"

src/agentlab/agents/generic_agent/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,26 @@
1515
RANDOM_SEARCH_AGENT,
1616
AGENT_4o,
1717
AGENT_4o_MINI,
18+
AGENT_CLAUDE_SONNET_35,
1819
AGENT_4o_VISION,
20+
AGENT_o3_MINI,
21+
AGENT_o1_MINI,
1922
)
2023

2124
__all__ = [
2225
"AGENT_3_5",
2326
"AGENT_4o",
2427
"AGENT_4o_MINI",
2528
"AGENT_4o_VISION",
29+
"AGENT_o3_MINI",
30+
"AGENT_o1_MINI",
2631
"AGENT_LLAMA3_70B",
2732
"AGENT_LLAMA31_70B",
2833
"AGENT_8B",
2934
"RANDOM_SEARCH_AGENT",
3035
"AGENT_CUSTOM",
36+
"AGENT_CLAUDE_SONNET_35",
37+
"AGENT_4o_VISION",
38+
"AGENT_4o_MINI_VISION",
39+
"AGENT_CLAUDE_SONNET_35_VISION",
3140
]

src/agentlab/agents/generic_agent/agent_configs.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,20 @@
260260
chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"],
261261
flags=FLAGS_GPT_4o,
262262
)
263+
AGENT_CLAUDE_SONNET_35 = GenericAgentArgs(
264+
chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/anthropic/claude-3.5-sonnet:beta"],
265+
flags=FLAGS_GPT_4o,
266+
)
263267

268+
AGENT_o3_MINI = GenericAgentArgs(
269+
chat_model_args=CHAT_MODEL_ARGS_DICT["openai/o3-mini-2025-01-31"],
270+
flags=FLAGS_GPT_4o,
271+
)
272+
273+
AGENT_o1_MINI = GenericAgentArgs(
274+
chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/openai/o1-mini-2024-09-12"],
275+
flags=FLAGS_GPT_4o,
276+
)
264277
# GPT-4o vision default config
265278
FLAGS_GPT_4o_VISION = FLAGS_GPT_4o.copy()
266279
FLAGS_GPT_4o_VISION.obs.use_screenshot = True
@@ -271,6 +284,16 @@
271284
flags=FLAGS_GPT_4o_VISION,
272285
)
273286

287+
AGENT_4o_MINI_VISION = GenericAgentArgs(
288+
chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"],
289+
flags=FLAGS_GPT_4o_VISION,
290+
)
291+
292+
AGENT_CLAUDE_SONNET_35_VISION = GenericAgentArgs(
293+
chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/anthropic/claude-3.5-sonnet:beta"],
294+
flags=FLAGS_GPT_4o_VISION,
295+
)
296+
274297

275298
DEFAULT_RS_FLAGS = GenericPromptFlags(
276299
flag_group="default_rs",

src/agentlab/agents/generic_agent/generic_agent.py

Lines changed: 2 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,19 @@
1010

1111
from copy import deepcopy
1212
from dataclasses import asdict, dataclass
13-
from functools import partial
1413
from warnings import warn
1514

1615
import bgym
1716
from browsergym.experiments.agent import Agent, AgentInfo
1817

1918
from agentlab.agents import dynamic_prompting as dp
2019
from agentlab.agents.agent_args import AgentArgs
21-
from agentlab.llm.chat_api import BaseModelArgs, make_system_message, make_user_message
20+
from agentlab.llm.chat_api import BaseModelArgs
2221
from agentlab.llm.llm_utils import Discussion, ParseError, SystemMessage, retry
2322
from agentlab.llm.tracking import cost_tracker_decorator
2423

2524
from .generic_agent_prompt import GenericPromptFlags, MainPrompt
25+
from functools import partial
2626

2727

2828
@dataclass
@@ -200,82 +200,3 @@ def _get_maxes(self):
200200
else 20 # dangerous to change the default value here?
201201
)
202202
return max_prompt_tokens, max_trunc_itr
203-
204-
205-
from functools import partial
206-
207-
208-
def get_action_post_hoc(agent: GenericAgent, obs: dict, ans_dict: dict):
209-
"""
210-
Get the action post-hoc for the agent.
211-
212-
This function is used to get the action after the agent has already been run.
213-
Its goal is to recreate the prompt and the output of the agent a posteriori.
214-
The purpose is to build datasets for training the agents.
215-
216-
Args:
217-
agent (GenericAgent): The agent for which the action is being determined.
218-
obs (dict): The observation dictionary to append to the agent's history.
219-
ans_dict (dict): The answer dictionary containing the plan, step, memory, think, and action.
220-
221-
Returns:
222-
Tuple[str, str, str]: The system prompt, the fitted instruction prompt, and the output reconstructed from the answer dictionary.
223-
"""
224-
system_prompt = dp.SystemPrompt().prompt
225-
226-
agent.obs_history.append(obs)
227-
228-
main_prompt = MainPrompt(
229-
action_set=agent.action_set,
230-
obs_history=agent.obs_history,
231-
actions=agent.actions,
232-
memories=agent.memories,
233-
thoughts=agent.thoughts,
234-
previous_plan=agent.plan,
235-
step=agent.plan_step,
236-
flags=agent.flags,
237-
)
238-
239-
max_prompt_tokens, max_trunc_itr = agent._get_maxes()
240-
241-
fit_function = partial(
242-
dp.fit_tokens,
243-
max_prompt_tokens=max_prompt_tokens,
244-
model_name=agent.chat_model_args.model_name,
245-
max_iterations=max_trunc_itr,
246-
)
247-
248-
instruction_prompt = fit_function(shrinkable=main_prompt)
249-
250-
if isinstance(instruction_prompt, list):
251-
# NOTE: this is when we have images
252-
instruction_prompt = instruction_prompt[0]["text"]
253-
254-
# TODO: make sure the bid is in the prompt
255-
256-
output = ""
257-
258-
# TODO: validate this
259-
agent.plan = ans_dict.get("plan", agent.plan)
260-
if agent.plan != "No plan yet":
261-
output += f"\n<plan>\n{agent.plan}\n</plan>\n"
262-
263-
# TODO: is plan_step something that the agent outputs?
264-
agent.plan_step = ans_dict.get("step", agent.plan_step)
265-
266-
memory = ans_dict.get("memory", None)
267-
agent.memories.append(memory)
268-
if memory is not None:
269-
output += f"\n<memory>\n{memory}\n</memory>\n"
270-
271-
thought = ans_dict.get("think", None)
272-
agent.thoughts.append(thought)
273-
if thought is not None:
274-
output += f"\n<think>\n{thought}\n</think>\n"
275-
276-
action = ans_dict["action"]
277-
agent.actions.append(action)
278-
if action is not None:
279-
output += f"\n<action>\n{action}\n</action>"
280-
281-
return system_prompt, instruction_prompt, output

src/agentlab/agents/generic_agent/reproducibility_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
This module contains the classes and functions to reproduce the results of a
66
study. It is used to create a new study that will run the same experiments as
77
the original study, but with a reproducibility agent that will mimic the same
8-
answers as the original agent.
8+
answers as the original agent.
99
1010
Stats are collected to compare the original agent's answers with the new agent's
1111
answers. Load this reproducibility study in agent-xray to compare the results.

src/agentlab/llm/chat_api.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,13 @@ def make_model(self):
143143
max_new_tokens=self.max_new_tokens,
144144
n_retry_server=self.n_retry_server,
145145
)
146+
elif self.backend == "vllm":
147+
return VLLMChatModel(
148+
model_name=self.model_name,
149+
temperature=self.temperature,
150+
max_tokens=self.max_new_tokens,
151+
n_retry_server=self.n_retry_server,
152+
)
146153
else:
147154
raise ValueError(f"Backend {self.backend} is not supported")
148155

@@ -429,3 +436,27 @@ def __init__(
429436

430437
client = InferenceClient(model=model_url, token=token)
431438
self.llm = partial(client.text_generation, max_new_tokens=max_new_tokens)
439+
440+
441+
class VLLMChatModel(ChatModel):
442+
def __init__(
443+
self,
444+
model_name,
445+
api_key=None,
446+
temperature=0.5,
447+
max_tokens=100,
448+
n_retry_server=4,
449+
min_retry_wait_time=60,
450+
):
451+
super().__init__(
452+
model_name=model_name,
453+
api_key=api_key,
454+
temperature=temperature,
455+
max_tokens=max_tokens,
456+
max_retry=n_retry_server,
457+
min_retry_wait_time=min_retry_wait_time,
458+
api_key_env_var="VLLM_API_KEY",
459+
client_class=OpenAI,
460+
client_args={"base_url": "http://0.0.0.0:8000/v1"},
461+
pricing_func=None,
462+
)

0 commit comments

Comments
 (0)