support browser automation (#36043)

bobogogo1990 · bobogogo · web-flow · commit 15302a3685bb · 2025-09-25T15:25:01.000-07:00
### Packages impacted by this PR ### Issues associated with this PR ### Describe the problem that is addressed by this PR ### What are the possible designs available to address the problem? If there is more than one possible design, why was the one in this PR chosen? ### Are there test cases added in this PR? _(If not, why?)_ ### Provide a list of related PRs _(if any)_ ### Command used to generate this PR: _(Applicable only to SDK release request PRs)_ ### Checklists - [ ] Added impacted package name to the issue description - [ ] Does this PR need any fixes in the SDK Generator? _(If so, create an Issue in the [Autorest/typescript](https://github.com/Azure/autorest.typescript) repository and link it here)_ - [ ] Added a changelog (if necessary) --------- Co-authored-by: bobogogo1990 <bobobupt2014@163.com>
diff --git a/sdk/ai/ai-agents/CHANGELOG.md b/sdk/ai/ai-agents/CHANGELOG.md
@@ -1,14 +1,10 @@
 # Release History
 
-## 1.2.0-beta.2 (Unreleased)
+## 1.2.0-beta.2 (2025-09-26)
 
 ### Features Added
 
-### Breaking Changes
-
-### Bugs Fixed
-
-### Other Changes
+- Added `ToolUtility.createBrowserAutomationTool` to support the Browser Automation tool in agents.
 
 ## 1.2.0-beta.1 (2025-09-18)
 
diff --git a/sdk/ai/ai-agents/review/ai-agents-node.api.md b/sdk/ai/ai-agents/review/ai-agents-node.api.md
@@ -1799,6 +1799,9 @@ export class ToolUtility {
     static createBingGroundingTool(searchConfigurations: BingGroundingSearchConfiguration[]): {
         definition: BingGroundingToolDefinition;
     };
+    static createBrowserAutomationTool(connectionId: string): {
+        definition: BrowserAutomationToolDefinition;
+    };
     static createCodeInterpreterTool(fileIds?: string[], dataSources?: Array<VectorStoreDataSource>): {
         definition: CodeInterpreterToolDefinition;
         resources: ToolResources;
diff --git a/sdk/ai/ai-agents/samples-dev/agentsBrowserAutomation.ts b/sdk/ai/ai-agents/samples-dev/agentsBrowserAutomation.ts
@@ -0,0 +1,155 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+/**
+ * This sample demonstrates how to use agent operations with the Browser Automation tool from
+ * the Azure Agents service.
+ *
+ * @summary demonstrates how to use agent operations with the Browser Automation tool.
+ */
+
+import type {
+  MessageTextContent,
+  ThreadMessage,
+  RunStepToolCallDetails,
+  RunStepBrowserAutomationToolCall,
+  MessageTextUrlCitationAnnotation,
+} from "@azure/ai-agents";
+import { AgentsClient, isOutputOfType, ToolUtility } from "@azure/ai-agents";
+import { DefaultAzureCredential } from "@azure/identity";
+import "dotenv/config";
+
+const projectEndpoint = process.env["PROJECT_ENDPOINT"] || "<project endpoint>";
+const modelDeploymentName = process.env["MODEL_DEPLOYMENT_NAME"] || "gpt-4o";
+const azurePlaywrightConnectionId =
+  process.env["AZURE_PLAYWRIGHT_CONNECTION_ID"] || "<connection id>";
+
+export async function main(): Promise<void> {
+  const connectionId = azurePlaywrightConnectionId;
+
+  // Build the Browser Automation tool definition from the Playwright connection id
+  const browserAutomation = ToolUtility.createBrowserAutomationTool(connectionId);
+
+  // Create an Azure AI Agents Client
+  const client = new AgentsClient(projectEndpoint, new DefaultAzureCredential());
+
+  // Create a new Agent that has the Browser Automation tool attached.
+  const agent = await client.createAgent(modelDeploymentName, {
+    name: "my-agent",
+    instructions: `
+      You are an Agent helping with browser automation tasks. 
+      You can answer questions, provide information, and assist with various tasks 
+      related to web browsing using the Browser Automation tool available to you.
+    `,
+    tools: [browserAutomation.definition],
+  });
+
+  console.log(`Created agent, ID: ${agent.id}`);
+
+  // Create thread for communication
+  const thread = await client.threads.create();
+  console.log(`Created thread, ID: ${thread.id}`);
+
+  // Add the user message that describes the browsing task to the thread
+  const message = await client.messages.create(
+    thread.id,
+    "user",
+    `
+      Your goal is to report the percent of Microsoft year-to-date stock price change.
+      To do that, go to the website finance.yahoo.com.
+      At the top of the page, you will find a search bar.
+      Enter the value 'MSFT', to get information about the Microsoft stock price.
+      At the top of the resulting page you will see a default chart of Microsoft stock price.
+      Click on 'YTD' at the top of that chart, and report the percent value that shows up just below it.
+    `,
+  );
+  console.log(`Created message, ID: ${message.id}`);
+
+  // Create the run and poll (every 2 seconds) until createAndPoll resolves
+  console.log("Waiting for Agent run to complete. Please wait...");
+  const run = await client.runs.createAndPoll(thread.id, agent.id, {
+    pollingOptions: {
+      intervalInMs: 2000,
+    },
+  });
+
+  console.log(`Run finished with status: ${run.status}`);
+
+  if (run.status === "failed") {
+    console.log(`Run failed: ${JSON.stringify(run.lastError)}`);
+  }
+
+  // Fetch run steps to get the details of the agent run
+  const runStepsIterator = client.runSteps.list(thread.id, run.id);
+  console.log("\nRun Steps:");
+
+  for await (const step of runStepsIterator) {
+    console.log(`Step ${step.id} status: ${step.status}`);
+
+    if (isOutputOfType<RunStepToolCallDetails>(step.stepDetails, "tool_calls")) {
+      console.log("  Tool calls:");
+      const toolCalls = step.stepDetails.toolCalls;
+
+      for (const call of toolCalls) {
+        console.log(`    Tool call ID: ${call.id}`);
+        console.log(`    Tool call type: ${call.type}`);
+
+        if (isOutputOfType<RunStepBrowserAutomationToolCall>(call, "browser_automation")) {
+          console.log(`    Browser automation input: ${call.browserAutomation.input}`);
+          console.log(`    Browser automation output: ${call.browserAutomation.output}`);
+
+          console.log("    Steps:");
+          for (const toolStep of call.browserAutomation.steps) {
+            console.log(`      Last step result: ${toolStep.lastStepResult}`);
+            console.log(`      Current state: ${toolStep.currentState}`);
+            console.log(`      Next step: ${toolStep.nextStep}`);
+            console.log(); // add an extra newline between tool steps
+          }
+        }
+
+        console.log(); // add an extra newline between tool calls
+      }
+    }
+
+    console.log(); // add an extra newline between run steps
+  }
+
+  // Optional: Delete the agent once the run is finished.
+  // Comment out this line if you plan to reuse the agent later.
+  await client.deleteAgent(agent.id);
+  console.log("Deleted agent");
+
+  // Print the Agent's response message with optional citation
+  const messagesIterator = client.messages.list(thread.id);
+  const messages: ThreadMessage[] = [];
+
+  for await (const msg of messagesIterator) {
+    messages.unshift(msg); // Add to beginning to maintain chronological order
+  }
+
+  // Find the last assistant message: search a reversed copy so the newest match wins
+  const responseMessage = [...messages]
+    .reverse()
+    .find((msg) => msg.role === "assistant" && msg.content.length > 0);
+
+  if (responseMessage) {
+    // Print every text part of the response, followed by its URL citations if any
+    for (const content of responseMessage.content) {
+      if (isOutputOfType<MessageTextContent>(content, "text")) {
+        console.log(`Agent response: ${content.text.value}`);
+        for (const annotation of content.text.annotations || []) {
+          if (isOutputOfType<MessageTextUrlCitationAnnotation>(annotation, "url_citation")) {
+            console.log(
+              `URL Citation: [${annotation.urlCitation.title}](${annotation.urlCitation.url})`,
+            );
+          }
+        }
+      }
+    }
+  }
+}
+
+main().catch((err) => {
+  console.error("The sample encountered an error:", err);
+  process.exit(1);
+});
diff --git a/sdk/ai/ai-agents/samples/v1-beta/javascript/README.md b/sdk/ai/ai-agents/samples/v1-beta/javascript/README.md
@@ -19,6 +19,7 @@ These sample programs show how to use the JavaScript client libraries for Azure
 | [agentsBasics.js][agentsbasics]                                               | demonstrates how to use basic agent operations.                                                                            |
 | [agentsBingGrounding.js][agentsbinggrounding]                                 | demonstrates how to use agent operations with the Grounding with Bing Search tool.                                         |
 | [agentsBingGroundingWithStreaming.js][agentsbinggroundingwithstreaming]       | demonstrates how to use agent operations with the Grounding with Bing Search tool using streaming.                         |
+| [agentsBrowserAutomation.js][agentsbrowserautomation]                         | demonstrates how to use agent operations with the Browser Automation tool.                                                 |
 | [agentsConnectedAgents.js][agentsconnectedagents]                             | This sample demonstrates how to use Agent operations with the Connected Agent tool from the Azure Agents service.          |
 | [agentsImageInputWithBase64.js][agentsimageinputwithbase64]                   | This sample demonstrates how to use basic agent operations with image input (base64 encoded) for the Azure Agents service. |
 | [agentsImageInputWithFile.js][agentsimageinputwithfile]                       | This sample demonstrates how to use basic agent operations using image file input for the Azure Agents service.            |
@@ -90,6 +91,7 @@ Take a look at our [API Documentation][apiref] for more information about the AP
 [agentsbasics]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsBasics.js
 [agentsbinggrounding]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsBingGrounding.js
 [agentsbinggroundingwithstreaming]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsBingGroundingWithStreaming.js
+[agentsbrowserautomation]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsBrowserAutomation.js
 [agentsconnectedagents]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsConnectedAgents.js
 [agentsimageinputwithbase64]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsImageInputWithBase64.js
 [agentsimageinputwithfile]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsImageInputWithFile.js
diff --git a/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsBrowserAutomation.js b/sdk/ai/ai-agents/samples/v1-beta/javascript/agentsBrowserAutomation.js
@@ -0,0 +1,150 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+/**
+ * This sample demonstrates how to use agent operations with the Browser Automation tool from
+ * the Azure Agents service.
+ *
+ * @summary demonstrates how to use agent operations with the Browser Automation tool.
+ */
+
+const { AgentsClient, isOutputOfType, ToolUtility } = require("@azure/ai-agents");
+const { DefaultAzureCredential } = require("@azure/identity");
+require("dotenv/config");
+
+const projectEndpoint = process.env["PROJECT_ENDPOINT"] || "<project endpoint>";
+const modelDeploymentName = process.env["MODEL_DEPLOYMENT_NAME"] || "gpt-4o";
+const azurePlaywrightConnectionId =
+  process.env["AZURE_PLAYWRIGHT_CONNECTION_ID"] || "<connection id>";
+
+async function main() {
+  const connectionId = azurePlaywrightConnectionId;
+
+  // Initialize Browser Automation tool and add the connection id
+  const browserAutomation = ToolUtility.createBrowserAutomationTool(connectionId);
+
+  // Create an Azure AI Agents Client
+  const client = new AgentsClient(projectEndpoint, new DefaultAzureCredential());
+
+  // Create a new Agent that has the Browser Automation tool attached.
+  const agent = await client.createAgent(modelDeploymentName, {
+    name: "my-agent",
+    instructions: `
+      You are an Agent helping with browser automation tasks. 
+      You can answer questions, provide information, and assist with various tasks 
+      related to web browsing using the Browser Automation tool available to you.
+    `,
+    tools: [browserAutomation.definition],
+  });
+
+  console.log(`Created agent, ID: ${agent.id}`);
+
+  // Create thread for communication
+  const thread = await client.threads.create();
+  console.log(`Created thread, ID: ${thread.id}`);
+
+  // Create message to thread
+  const message = await client.messages.create(
+    thread.id,
+    "user",
+    `
+      Your goal is to report the percent of Microsoft year-to-date stock price change.
+      To do that, go to the website finance.yahoo.com.
+      At the top of the page, you will find a search bar.
+      Enter the value 'MSFT', to get information about the Microsoft stock price.
+      At the top of the resulting page you will see a default chart of Microsoft stock price.
+      Click on 'YTD' at the top of that chart, and report the percent value that shows up just below it.
+    `,
+  );
+  console.log(`Created message, ID: ${message.id}`);
+
+  // Create and process agent run in thread with tools
+  console.log("Waiting for Agent run to complete. Please wait...");
+  const run = await client.runs.createAndPoll(thread.id, agent.id, {
+    pollingOptions: {
+      intervalInMs: 2000,
+    },
+  });
+
+  console.log(`Run finished with status: ${run.status}`);
+
+  if (run.status === "failed") {
+    console.log(`Run failed: ${JSON.stringify(run.lastError)}`);
+  }
+
+  // Fetch run steps to get the details of the agent run
+  const runStepsIterator = client.runSteps.list(thread.id, run.id);
+  console.log("\nRun Steps:");
+
+  for await (const step of runStepsIterator) {
+    console.log(`Step ${step.id} status: ${step.status}`);
+
+    if (isOutputOfType(step.stepDetails, "tool_calls")) {
+      console.log("  Tool calls:");
+      const toolCalls = step.stepDetails.toolCalls;
+
+      for (const call of toolCalls) {
+        console.log(`    Tool call ID: ${call.id}`);
+        console.log(`    Tool call type: ${call.type}`);
+
+        if (isOutputOfType(call, "browser_automation")) {
+          console.log(`    Browser automation input: ${call.browserAutomation.input}`);
+          console.log(`    Browser automation output: ${call.browserAutomation.output}`);
+
+          console.log("    Steps:");
+          for (const toolStep of call.browserAutomation.steps) {
+            console.log(`      Last step result: ${toolStep.lastStepResult}`);
+            console.log(`      Current state: ${toolStep.currentState}`);
+            console.log(`      Next step: ${toolStep.nextStep}`);
+            console.log(); // add an extra newline between tool steps
+          }
+        }
+
+        console.log(); // add an extra newline between tool calls
+      }
+    }
+
+    console.log(); // add an extra newline between run steps
+  }
+
+  // Optional: Delete the agent once the run is finished.
+  // Comment out this line if you plan to reuse the agent later.
+  await client.deleteAgent(agent.id);
+  console.log("Deleted agent");
+
+  // Print the Agent's response message with optional citation
+  const messagesIterator = client.messages.list(thread.id);
+  const messages = [];
+
+  for await (const msg of messagesIterator) {
+    messages.unshift(msg); // Add to beginning to maintain chronological order
+  }
+
+  // Find the last assistant message
+  const responseMessage = messages.find(
+    (msg) => msg.role === "assistant" && msg.content.length > 0,
+  );
+
+  if (responseMessage) {
+    // Display URL citations if any
+    for (const content of responseMessage.content) {
+      if (isOutputOfType(content, "text")) {
+        console.log(`Agent response: ${content.text.value}`);
+        for (const annotation of content.text.annotations || []) {
+          if (isOutputOfType(annotation, "url_citation")) {
+            console.log(
+              `URL Citation: [${annotation.urlCitation.title}](${annotation.urlCitation.url})`,
+            );
+          }
+        }
+      }
+    }
+  }
+}
+
+main().catch((err) => {
+  console.error("The sample encountered an error:", err);
+  process.exit(1);
+});
+
+module.exports = { main };
diff --git a/sdk/ai/ai-agents/samples/v1-beta/typescript/README.md b/sdk/ai/ai-agents/samples/v1-beta/typescript/README.md
@@ -19,6 +19,7 @@ These sample programs show how to use the TypeScript client libraries for Azure
 | [agentsBasics.ts][agentsbasics]                                               | demonstrates how to use basic agent operations.                                                                            |
 | [agentsBingGrounding.ts][agentsbinggrounding]                                 | demonstrates how to use agent operations with the Grounding with Bing Search tool.                                         |
 | [agentsBingGroundingWithStreaming.ts][agentsbinggroundingwithstreaming]       | demonstrates how to use agent operations with the Grounding with Bing Search tool using streaming.                         |
+| [agentsBrowserAutomation.ts][agentsbrowserautomation]                         | demonstrates how to use agent operations with the Browser Automation tool.                                                 |
 | [agentsConnectedAgents.ts][agentsconnectedagents]                             | This sample demonstrates how to use Agent operations with the Connected Agent tool from the Azure Agents service.          |
 | [agentsImageInputWithBase64.ts][agentsimageinputwithbase64]                   | This sample demonstrates how to use basic agent operations with image input (base64 encoded) for the Azure Agents service. |
 | [agentsImageInputWithFile.ts][agentsimageinputwithfile]                       | This sample demonstrates how to use basic agent operations using image file input for the Azure Agents service.            |
@@ -102,6 +103,7 @@ Take a look at our [API Documentation][apiref] for more information about the AP
 [agentsbasics]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsBasics.ts
 [agentsbinggrounding]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsBingGrounding.ts
 [agentsbinggroundingwithstreaming]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsBingGroundingWithStreaming.ts
+[agentsbrowserautomation]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsBrowserAutomation.ts
 [agentsconnectedagents]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsConnectedAgents.ts
 [agentsimageinputwithbase64]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsImageInputWithBase64.ts
 [agentsimageinputwithfile]: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsImageInputWithFile.ts
diff --git a/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsBrowserAutomation.ts b/sdk/ai/ai-agents/samples/v1-beta/typescript/src/agentsBrowserAutomation.ts
diff --git a/sdk/ai/ai-agents/src/api/agentsContext.ts b/sdk/ai/ai-agents/src/api/agentsContext.ts
diff --git a/sdk/ai/ai-agents/src/utils/utils.ts b/sdk/ai/ai-agents/src/utils/utils.ts