Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import random
import shutil
import tempfile
import time
from typing import Optional

from botocore.config import Config as BotocoreConfig
Expand Down Expand Up @@ -125,6 +126,9 @@ def __init__(
self._prompt_history: list[str] = []
self._sampled_indices: list[int] = []

self.last_metrics: dict = {}
self.last_wall_clock_s: float = 0.0

def _create_agent(self, system_prompt: str) -> Agent:
"""Create a strands Agent with shell tool, submit tool, and output guardrail."""
if self.boto_config:
Expand Down Expand Up @@ -193,6 +197,26 @@ def _get_extra_tools(self) -> list:
"""Return additional tools for the agent. Override in subclasses."""
return []

def _invoke_agent(self, agent: Agent, message: str):
    """Invoke the agent and record wall-clock + Strands metrics on self.

    Args:
        agent: The agent to run; it is called as ``agent(message)``.
        message: The task message handed to the agent.

    Returns:
        The response object returned by ``agent(message)``.

    Raises:
        Whatever ``agent(message)`` raises; the exception is not swallowed.

    Side effects:
        ``self.last_wall_clock_s`` is set to the elapsed seconds of the call,
        including when the agent raises. ``self.last_metrics`` is cleared
        before the call and filled from ``response.metrics`` on success, so a
        failed invocation never leaves the previous call's metrics behind.
    """
    # Clear first: if the call below fails, readers must not mistake the
    # previous invocation's numbers for this one's.
    self.last_metrics = {}

    # perf_counter() is monotonic, so the measured duration cannot go
    # negative or jump when the system clock is adjusted mid-call.
    started = time.perf_counter()
    try:
        response = agent(message)
    finally:
        # Record the duration even on failure; long-running failed calls are
        # exactly the ones worth timing.
        self.last_wall_clock_s = time.perf_counter() - started

    m = response.metrics
    # NOTE(review): accumulated_usage / accumulated_metrics are read with
    # .get(), i.e. assumed to be dict-like; absent counters default to 0.
    usage = m.accumulated_usage
    self.last_metrics = {
        "cycle_count": m.cycle_count,
        "input_tokens": usage.get("inputTokens", 0),
        "output_tokens": usage.get("outputTokens", 0),
        "total_tokens": usage.get("totalTokens", 0),
        "cache_read_tokens": usage.get("cacheReadInputTokens", 0),
        "cache_write_tokens": usage.get("cacheWriteInputTokens", 0),
        "latency_ms": m.accumulated_metrics.get("latencyMs", 0),
        # Per-tool invocation counts, keyed by tool name.
        "tool_calls": {k: v.call_count for k, v in m.tool_metrics.items()},
    }
    return response
Comment thread
jnzs1836 marked this conversation as resolved.
Comment thread
jnzs1836 marked this conversation as resolved.

def _get_submitted_params(self) -> Optional[dict]:
"""Get the params submitted via the tool, then reset."""
params = self._submitted_params
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,6 @@ def _run_reflection(self, traces_folder: str, params: dict) -> None:
task_message = self._task_message_template.render(**template_vars)

agent = self._create_agent(system_prompt)
agent(task_message)
self._invoke_agent(agent, task_message)

# get_state() and load_state() inherited from BaseAgenticOptimizer
27 changes: 15 additions & 12 deletions strands_harness_optimizer/rollout_engines/agentcore_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import json
import logging
import uuid
from typing import Iterator, Optional
from typing import Callable, Iterator, Optional

import boto3
from botocore.config import Config as BotocoreConfig
Expand All @@ -17,6 +17,8 @@
from ..utils.parallel_rollout import expand_for_num_rollouts, run_parallel
from .agent_rollout_engine import AgentRolloutEngine

# Type of the optional payload transform accepted by AgentCoreRolloutEngine:
# it receives the canonical {"data_sample": ..., "params": ...} dict and
# returns the dict that is actually sent to the runtime.
PayloadMapper = Callable[[dict], dict]

logger = logging.getLogger(__name__)

_DEFAULT_BOTO_CONFIG = {
Expand Down Expand Up @@ -108,20 +110,20 @@ class AgentCoreRolloutEngine(AgentRolloutEngine):
The runtime must already be deployed. Formula parameters are synced
via ensure_sync_params() and included in each invocation payload.

The engine builds a canonical ``{"data_sample": ..., "params": ...}``
payload per invocation. ``payload_mapper`` is an optional transform
applied to that dict before it goes on the wire — use it when the
deployed runtime expects a different shape (flat fields, renamed keys,
nested envelopes).

Args:
formula: The Formula being optimized.
agent_arn: ARN of the deployed AgentCore agent runtime.
region_name: AWS region where the runtime is deployed.
boto_config: BotocoreConfig or dict to merge with defaults.
num_rollouts: Default number of rollouts per data sample.
num_workers: Number of parallel workers for concurrent invocations.

Example:
engine = AgentCoreRolloutEngine(
formula=formula,
agent_arn="arn:aws:bedrock-agentcore:us-west-2:123:runtime/abc",
num_workers=4,
)
payload_mapper: Optional transform of the canonical payload.
"""

def __init__(
Expand All @@ -132,6 +134,7 @@ def __init__(
boto_config=None,
num_rollouts: int = 1,
num_workers: int = 1,
payload_mapper: Optional[PayloadMapper] = None,
):
super().__init__(formula, num_rollouts)
self.num_workers = num_workers
Expand All @@ -141,6 +144,7 @@ def __init__(
region_name=region_name,
boto_config=boto_config,
)
self._payload_mapper = payload_mapper

logger.info(f"Initialized AgentCoreRolloutEngine (num_workers={num_workers})")

Expand All @@ -157,10 +161,9 @@ def generate_batch(self, data_samples: list[dict]) -> Iterator[Rollout]:

def _invoke_runtime(self, data_sample: dict) -> Rollout:
"""Invoke the AgentCore runtime for a single data sample."""
payload = {
"data_sample": data_sample,
"params": self._synced_params,
}
payload = {"data_sample": data_sample, "params": self._synced_params}
if self._payload_mapper is not None:
payload = self._payload_mapper(payload)
Comment thread
jnzs1836 marked this conversation as resolved.
Comment thread
jnzs1836 marked this conversation as resolved.
Comment thread
jnzs1836 marked this conversation as resolved.

Comment thread
jnzs1836 marked this conversation as resolved.
response_data = self._client.invoke(payload)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
You are an expert prompt engineer. Your task is to analyze agent execution trajectories and generate an optimized system prompt through contrastive learning.
Comment thread
jnzs1836 marked this conversation as resolved.

## Context Window Management

Trajectory files can be large. Before reading ANY file:
```bash
ls -lh <file_path>
```
- Files < 10KB: Safe to read fully
- Files ≥ 10KB: Use `head`, `tail`, `grep` with limits

## Your Task

1. Analyze the provided trajectories (successful vs failed)
2. Extract actionable insights from the differences
3. **INTEGRATE** these insights into the original system prompt's existing structure

## CRITICAL: Output Format

You must output a **REVISED** system prompt that reads as a single coherent document — not `[original] + appendix`.

- Preserve structural blocks verbatim (tool schemas, output formats, few-shot examples)
- Rewrite prose sections to weave in insights where they belong
- Submit the revised prompt via the `submit_optimized_params` tool (see end of this prompt for tool usage)
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
Analyze the trajectories and generate an optimized system prompt by **merging insights into the original**, not by appending a separate section.

## Inputs

**Trajectory Folder**: {{ traces_folder }}

**Original System Prompt to Optimize**:
<original_prompt>
{{ params.get("system_prompt", "") }}
</original_prompt>

---

## CRITICAL: Output Requirements

Your output is a **revised system prompt** that integrates learned insights into the original's existing structure. It should read as a single coherent document — not `[original] + "## Learned Behaviors"`.

### 1. Strict Preservation (verbatim — do NOT reword, reorder, or drop)

The following blocks from the original MUST appear in your output character-for-character identical:

- **Tool names** (every tool the original references)
- **Tool input/output schemas** (JSON schemas, argument descriptions, type signatures)
- **Output format specifications** (required response structure, required fields, required delimiters, JSON/YAML templates)
- **Few-shot examples** (input/output pairs, demonstration dialogues)
- **Any block the original marks as "do not modify", "required format", "template", or similar**

If you are uncertain whether a block is structural vs. prose, treat it as structural and preserve verbatim.

### 2. May Rewrite (integrate insights here)

- Prose instructions and guidance
- Explanations and rationale
- Section headings and ordering (as long as structural blocks stay intact)
- Bullet lists of do's/don'ts — you may add, remove, or edit bullets

When integrating a new insight, place it in the section of the original prompt it relates to (tool usage, error handling, task decomposition, etc.). Only create a new section for topics the original does not cover at all.

### 3. Safety Invariant (non-negotiable)

The revised prompt MUST require the agent to confirm with the user before taking any action with real-world consequences (writes, sends, purchases, external state changes, irreversible operations).

You MUST NOT introduce, and MUST REMOVE if present, any guidance that:
- Tells the agent to act "immediately", "autonomously", or "without confirmation"
- Treats user silence or ambiguity as approval
- Instructs the agent to skip, bypass, or avoid confirmation steps
- Trades user control for speed, convenience, or higher trace reward

If the original prompt does not already specify a confirmation policy, ADD one. Suggested wording (adapt to the original's voice):

> Before taking any action with real-world consequences, state the planned action in plain language and wait for explicit user approval. Do not treat silence as consent.

If traces appear to reward a behavior that conflicts with this invariant, treat it as a data quality / reward-model issue — do NOT encode it as a learned behavior.

### 4. Length Guardrail

{% set _orig = params.get("system_prompt", "") -%}
{%- set _orig_len = _orig | length -%}
{%- set _max_len = [_orig_len + 500, (_orig_len * 1.1) | int] | max -%}
Original prompt length: **{{ _orig_len }} characters**.
Final prompt length MUST NOT exceed **{{ _max_len }} characters** (max of 1.1x the original length and original + 500). Prefer integrating insights concisely over adding new sections. If you run out of budget, prioritize the highest-impact insights and drop the rest.

---

## Analysis Steps

**Step 1: Check file sizes**
```bash
ls -lh {{ traces_folder }}/*.json | head -20
```

**Step 2: Categorize traces by outcome**
```bash
for f in {{ traces_folder }}/*.json; do
echo -n "$f: "
grep -oE '"reward":\s*[0-9.-]+' "$f" | head -1
done 2>/dev/null | head -30
```

**Step 3: Examine sufficient traces**

| Folder Size | Minimum Coverage |
|-------------|------------------|
| ≤ 10 traces | Examine ALL |
| 11-30 traces | At least 70% |
| > 30 traces | At least 50% |

Balance coverage between successful and failed traces.

**Step 4: Extract insights from differences**

Focus on:
- What do successful agents do that failed ones don't?
- What mistakes do failed agents make?
- What patterns appear across MULTIPLE traces (not one-off issues)?

**Step 5: Place each insight where it belongs**

For each insight:
1. Identify which section of the original prompt it relates to
2. Integrate it there — add a bullet, edit existing guidance, or insert a sentence
3. Only create a new section if the original does not cover the topic

Do NOT collect insights into a single "Learned Behaviors" appendix.

**Step 6: Write the revised prompt to a file and submit**

Write your final revised system prompt to `/tmp/optimized_system_prompt.txt` (plain text, the prompt body only — no YAML wrapper, no markdown fence):

```bash
cat > /tmp/optimized_system_prompt.txt << 'PROMPT_EOF'
[your revised prompt here — single coherent document]
PROMPT_EOF
```

Then call:
```
submit_optimized_params(file_path_dict={"system_prompt": "/tmp/optimized_system_prompt.txt"})
```

**Step 7: Self-review (structural preservation)**

Re-read `/tmp/optimized_system_prompt.txt` and verify:

- [ ] Every tool name from the original appears in the output
- [ ] Every tool schema from the original appears character-for-character identical
- [ ] Every output format specification appears character-for-character identical
- [ ] Every few-shot example from the original appears character-for-character identical
- [ ] No block marked "do not modify" / "required format" / "template" has been altered

If any check fails, restore the missing/altered content from the original, rewrite the file, and re-submit via `submit_optimized_params`.

**Step 8: Self-review (safety invariant)**

Re-read the output and verify:

- [ ] The revised prompt requires user confirmation before consequential actions (either preserved from original, or added if the original lacked it)
- [ ] No line instructs the agent to skip, bypass, or avoid asking for confirmation
- [ ] No line treats silence or ambiguity as approval
- [ ] No line instructs the agent to act "immediately" / "autonomously" / "without confirmation" on behalf of the user (unless the original EXPLICITLY authorizes autonomous operation for a specific action)

If any check fails, remove the offending lines in place, rewrite the file, and re-submit via `submit_optimized_params`.

---

## Criteria for Good Insights

### Actionability
Each integrated insight should imply:
- A clear **trigger condition** (WHEN...)
- A specific **action** (DO...)

Reject vague insights like "be careful" or "consider edge cases".

### Coverage
- Address different failure modes observed across traces
- Prioritize patterns that appear in multiple traces (not one-off issues)

### Appropriate Generalization
- Keep actions concrete and specific
- Generalize trigger conditions only when the pattern applies broadly
Loading