def _invoke_agent(self, agent: "Agent", message: str):
    """Invoke the agent and record wall-clock time and Strands metrics on self.

    Args:
        agent: The agent to run; it is called as ``agent(message)``.
        message: The task message handed to the agent.

    Returns:
        The response object returned by ``agent(message)``.

    Raises:
        Anything ``agent(message)`` raises is propagated unchanged. In that
        case ``self.last_wall_clock_s`` still holds the elapsed time of the
        failed call and ``self.last_metrics`` is left as an empty dict.

    Side effects:
        Overwrites ``self.last_wall_clock_s`` (seconds, float) and
        ``self.last_metrics`` (a flat dict built from ``response.metrics``).
    """
    # Reset up front so a failed invocation can never be mistaken for a
    # successful one by a caller that reads the previous call's metrics.
    self.last_metrics = {}

    # perf_counter() is monotonic; time.time() follows the system clock and
    # can jump (NTP sync, manual changes), yielding wrong or negative durations.
    started = time.perf_counter()
    try:
        response = agent(message)
    finally:
        # Record the duration even when the agent raises.
        self.last_wall_clock_s = time.perf_counter() - started

    m = response.metrics
    usage = m.accumulated_usage
    self.last_metrics = {
        "cycle_count": m.cycle_count,
        "input_tokens": usage.get("inputTokens", 0),
        "output_tokens": usage.get("outputTokens", 0),
        "total_tokens": usage.get("totalTokens", 0),
        "cache_read_tokens": usage.get("cacheReadInputTokens", 0),
        "cache_write_tokens": usage.get("cacheWriteInputTokens", 0),
        "latency_ms": m.accumulated_metrics.get("latencyMs", 0),
        # Keep only the per-tool call count from each tool's metrics object.
        "tool_calls": {k: v.call_count for k, v in m.tool_metrics.items()},
    }
    return response
b/strands_harness_optimizer/optimizers/system_prompt/contrastive_reflection.py index 7287a91..487697a 100644 --- a/strands_harness_optimizer/optimizers/system_prompt/contrastive_reflection.py +++ b/strands_harness_optimizer/optimizers/system_prompt/contrastive_reflection.py @@ -135,6 +135,6 @@ def _run_reflection(self, traces_folder: str, params: dict) -> None: task_message = self._task_message_template.render(**template_vars) agent = self._create_agent(system_prompt) - agent(task_message) + self._invoke_agent(agent, task_message) # get_state() and load_state() inherited from BaseAgenticOptimizer diff --git a/strands_harness_optimizer/rollout_engines/agentcore_engine.py b/strands_harness_optimizer/rollout_engines/agentcore_engine.py index fff777d..bd9a69e 100644 --- a/strands_harness_optimizer/rollout_engines/agentcore_engine.py +++ b/strands_harness_optimizer/rollout_engines/agentcore_engine.py @@ -7,7 +7,7 @@ import json import logging import uuid -from typing import Iterator, Optional +from typing import Callable, Iterator, Optional import boto3 from botocore.config import Config as BotocoreConfig @@ -17,6 +17,8 @@ from ..utils.parallel_rollout import expand_for_num_rollouts, run_parallel from .agent_rollout_engine import AgentRolloutEngine +PayloadMapper = Callable[[dict], dict] + logger = logging.getLogger(__name__) _DEFAULT_BOTO_CONFIG = { @@ -108,6 +110,12 @@ class AgentCoreRolloutEngine(AgentRolloutEngine): The runtime must already be deployed. Formula parameters are synced via ensure_sync_params() and included in each invocation payload. + The engine builds a canonical ``{"data_sample": ..., "params": ...}`` + payload per invocation. ``payload_mapper`` is an optional transform + applied to that dict before it goes on the wire — use it when the + deployed runtime expects a different shape (flat fields, renamed keys, + nested envelopes). + Args: formula: The Formula being optimized. agent_arn: ARN of the deployed AgentCore agent runtime. 
@@ -115,13 +123,7 @@ class AgentCoreRolloutEngine(AgentRolloutEngine): boto_config: BotocoreConfig or dict to merge with defaults. num_rollouts: Default number of rollouts per data sample. num_workers: Number of parallel workers for concurrent invocations. - - Example: - engine = AgentCoreRolloutEngine( - formula=formula, - agent_arn="arn:aws:bedrock-agentcore:us-west-2:123:runtime/abc", - num_workers=4, - ) + payload_mapper: Optional transform of the canonical payload. """ def __init__( @@ -132,6 +134,7 @@ def __init__( boto_config=None, num_rollouts: int = 1, num_workers: int = 1, + payload_mapper: Optional[PayloadMapper] = None, ): super().__init__(formula, num_rollouts) self.num_workers = num_workers @@ -141,6 +144,7 @@ def __init__( region_name=region_name, boto_config=boto_config, ) + self._payload_mapper = payload_mapper logger.info(f"Initialized AgentCoreRolloutEngine (num_workers={num_workers})") @@ -157,10 +161,9 @@ def generate_batch(self, data_samples: list[dict]) -> Iterator[Rollout]: def _invoke_runtime(self, data_sample: dict) -> Rollout: """Invoke the AgentCore runtime for a single data sample.""" - payload = { - "data_sample": data_sample, - "params": self._synced_params, - } + payload = {"data_sample": data_sample, "params": self._synced_params} + if self._payload_mapper is not None: + payload = self._payload_mapper(payload) response_data = self._client.invoke(payload) diff --git a/strands_harness_optimizer/templates/contrastive_reflection_v2/system_prompt.jinja b/strands_harness_optimizer/templates/contrastive_reflection_v2/system_prompt.jinja new file mode 100644 index 0000000..db1bfa3 --- /dev/null +++ b/strands_harness_optimizer/templates/contrastive_reflection_v2/system_prompt.jinja @@ -0,0 +1,24 @@ +You are an expert prompt engineer. Your task is to analyze agent execution trajectories and generate an optimized system prompt through contrastive learning. + +## Context Window Management + +Trajectory files can be large. 
Before reading ANY file: +```bash +ls -lh +``` +- Files < 10KB: Safe to read fully +- Files > 10KB: Use `head`, `tail`, `grep` with limits + +## Your Task + +1. Analyze the provided trajectories (successful vs failed) +2. Extract actionable insights from the differences +3. **INTEGRATE** these insights into the original system prompt's existing structure + +## CRITICAL: Output Format + +You must output a **REVISED** system prompt that reads as a single coherent document — not `[original] + appendix`. + +- Preserve structural blocks verbatim (tool schemas, output formats, few-shot examples) +- Rewrite prose sections to weave in insights where they belong +- Submit the revised prompt via the `submit_optimized_params` tool (see end of this prompt for tool usage) diff --git a/strands_harness_optimizer/templates/contrastive_reflection_v2/task_message_system_prompt.jinja b/strands_harness_optimizer/templates/contrastive_reflection_v2/task_message_system_prompt.jinja new file mode 100644 index 0000000..774b4dc --- /dev/null +++ b/strands_harness_optimizer/templates/contrastive_reflection_v2/task_message_system_prompt.jinja @@ -0,0 +1,161 @@ +Analyze the trajectories and generate an optimized system prompt by **merging insights into the original**, not by appending a separate section. + +## Inputs + +**Trajectory Folder**: {{ traces_folder }} + +**Original System Prompt to Optimize**: + +{{ params.get("system_prompt", "") }} + + +--- + +## CRITICAL: Output Requirements + +Your output is a **revised system prompt** that integrates learned insights into the original's existing structure. It should read as a single coherent document — not `[original] + "## Learned Behaviors"`. + +### 1. 
Strict Preservation (verbatim — do NOT reword, reorder, or drop) + +The following blocks from the original MUST appear in your output character-for-character identical: + +- **Tool names** (every tool the original references) +- **Tool input/output schemas** (JSON schemas, argument descriptions, type signatures) +- **Output format specifications** (required response structure, required fields, required delimiters, JSON/YAML templates) +- **Few-shot examples** (input/output pairs, demonstration dialogues) +- **Any block the original marks as "do not modify", "required format", "template", or similar** + +If you are uncertain whether a block is structural vs. prose, treat it as structural and preserve verbatim. + +### 2. May Rewrite (integrate insights here) + +- Prose instructions and guidance +- Explanations and rationale +- Section headings and ordering (as long as structural blocks stay intact) +- Bullet lists of do's/don'ts — you may add, remove, or edit bullets + +When integrating a new insight, place it in the section of the original prompt it relates to (tool usage, error handling, task decomposition, etc.). Only create a new section for topics the original does not cover at all. + +### 3. Safety Invariant (non-negotiable) + +The revised prompt MUST require the agent to confirm with the user before taking any action with real-world consequences (writes, sends, purchases, external state changes, irreversible operations). + +You MUST NOT introduce, and MUST REMOVE if present, any guidance that: +- Tells the agent to act "immediately", "autonomously", or "without confirmation" +- Treats user silence or ambiguity as approval +- Instructs the agent to skip, bypass, or avoid confirmation steps +- Trades user control for speed, convenience, or higher trace reward + +If the original prompt does not already specify a confirmation policy, ADD one. 
Suggested wording (adapt to the original's voice): + +> Before taking any action with real-world consequences, state the planned action in plain language and wait for explicit user approval. Do not treat silence as consent. + +If traces appear to reward a behavior that conflicts with this invariant, treat it as a data quality / reward-model issue — do NOT encode it as a learned behavior. + +### 4. Length Guardrail + +{% set _orig = params.get("system_prompt", "") -%} +{%- set _orig_len = _orig | length -%} +{%- set _max_len = [_orig_len + 500, (_orig_len * 1.1) | int] | max -%} +Original prompt length: **{{ _orig_len }} characters**. +Final prompt length MUST NOT exceed **{{ _max_len }} characters** (max of 1.1x the original length and original + 500). Prefer integrating insights concisely over adding new sections. If you run out of budget, prioritize the highest-impact insights and drop the rest. + +--- + +## Analysis Steps + +**Step 1: Check file sizes** +```bash +ls -lh {{ traces_folder }}/*.json | head -20 +``` + +**Step 2: Categorize traces by outcome** +```bash +for f in {{ traces_folder }}/*.json; do + echo -n "$f: " + grep -oE '"reward":\s*[0-9.-]+' "$f" | head -1 +done 2>/dev/null | head -30 +``` + +**Step 3: Examine sufficient traces** + +| Folder Size | Minimum Coverage | +|-------------|------------------| +| ≤ 10 traces | Examine ALL | +| 11-30 traces | At least 70% | +| > 30 traces | At least 50% | + +Balance coverage between successful and failed traces. + +**Step 4: Extract insights from differences** + +Focus on: +- What do successful agents do that failed ones don't? +- What mistakes do failed agents make? +- What patterns appear across MULTIPLE traces (not one-off issues)? + +**Step 5: Place each insight where it belongs** + +For each insight: +1. Identify which section of the original prompt it relates to +2. Integrate it there — add a bullet, edit existing guidance, or insert a sentence +3.
Only create a new section if the original does not cover the topic + +Do NOT collect insights into a single "Learned Behaviors" appendix. + +**Step 6: Write the revised prompt to a file and submit** + +Write your final revised system prompt to `/tmp/optimized_system_prompt.txt` (plain text, the prompt body only — no YAML wrapper, no markdown fence): + +```bash +cat > /tmp/optimized_system_prompt.txt << 'PROMPT_EOF' +[your revised prompt here — single coherent document] +PROMPT_EOF +``` + +Then call: +``` +submit_optimized_params(file_path_dict={"system_prompt": "/tmp/optimized_system_prompt.txt"}) +``` + +**Step 7: Self-review (structural preservation)** + +Re-read `/tmp/optimized_system_prompt.txt` and verify: + +- [ ] Every tool name from the original appears in the output +- [ ] Every tool schema from the original appears character-for-character identical +- [ ] Every output format specification appears character-for-character identical +- [ ] Every few-shot example from the original appears character-for-character identical +- [ ] No block marked "do not modify" / "required format" / "template" has been altered + +If any check fails, restore the missing/altered content from the original, rewrite the file, and re-submit via `submit_optimized_params`. + +**Step 8: Self-review (safety invariant)** + +Re-read the output and verify: + +- [ ] The revised prompt requires user confirmation before consequential actions (either preserved from original, or added if the original lacked it) +- [ ] No line instructs the agent to skip, bypass, or avoid asking for confirmation +- [ ] No line treats silence or ambiguity as approval +- [ ] No line instructs the agent to act "immediately" / "autonomously" / "without confirmation" on behalf of the user (unless the original EXPLICITLY authorizes autonomous operation for a specific action) + +If any check fails, remove the offending lines in place, rewrite the file, and re-submit via `submit_optimized_params`.
+ +--- + +## Criteria for Good Insights + +### Actionability +Each integrated insight should imply: +- A clear **trigger condition** (WHEN...) +- A specific **action** (DO...) + +Reject vague insights like "be careful" or "consider edge cases". + +### Coverage +- Address different failure modes observed across traces +- Prioritize patterns that appear in multiple traces (not one-off issues) + +### Appropriate Generalization +- Keep actions concrete and specific +- Generalize trigger conditions only when the pattern applies broadly