strands-labs · jnzs1836 · May 28, 2026
diff --git a/strands_harness_optimizer/optimizers/system_prompt/base_agentic_optimizer.py b/strands_harness_optimizer/optimizers/system_prompt/base_agentic_optimizer.py
@@ -12,6 +12,7 @@
 import random
 import shutil
 import tempfile
+import time
 from typing import Optional
 
 from botocore.config import Config as BotocoreConfig
@@ -125,6 +126,9 @@ def __init__(
         self._prompt_history: list[str] = []
         self._sampled_indices: list[int] = []
 
+        self.last_metrics: dict = {}
+        self.last_wall_clock_s: float = 0.0
+
     def _create_agent(self, system_prompt: str) -> Agent:
         """Create a strands Agent with shell tool, submit tool, and output guardrail."""
         if self.boto_config:
@@ -193,6 +197,43 @@ def _get_extra_tools(self) -> list:
         """Return additional tools for the agent. Override in subclasses."""
         return []
 
+    def _invoke_agent(self, agent: Agent, message: str):
+        """Invoke the agent and record wall-clock time and Strands metrics on self.
+
+        ``last_metrics`` is cleared before the call and ``last_wall_clock_s`` is
+        written in a ``finally`` block, so a call that raises leaves the elapsed
+        time of the failed attempt and no stale metrics from an earlier call.
+
+        Args:
+            agent: Callable agent; invoked as ``agent(message)``.
+            message: Task message passed to the agent.
+
+        Returns:
+            The response object returned by the agent.
+        """
+        self.last_metrics = {}
+        # perf_counter() is monotonic, so the elapsed time cannot go negative or
+        # jump when the system clock is adjusted (time.time() can).
+        t0 = time.perf_counter()
+        try:
+            response = agent(message)
+        finally:
+            self.last_wall_clock_s = time.perf_counter() - t0
+
+        m = response.metrics
+        usage = m.accumulated_usage
+        self.last_metrics = {
+            "cycle_count": m.cycle_count,
+            "input_tokens": usage.get("inputTokens", 0),
+            "output_tokens": usage.get("outputTokens", 0),
+            "total_tokens": usage.get("totalTokens", 0),
+            "cache_read_tokens": usage.get("cacheReadInputTokens", 0),
+            "cache_write_tokens": usage.get("cacheWriteInputTokens", 0),
+            "latency_ms": m.accumulated_metrics.get("latencyMs", 0),
+            "tool_calls": {k: v.call_count for k, v in m.tool_metrics.items()},
+        }
+        return response
+
     def _get_submitted_params(self) -> Optional[dict]:
         """Get the params submitted via the tool, then reset."""
         params = self._submitted_params

diff --git a/strands_harness_optimizer/optimizers/system_prompt/contrastive_reflection.py b/strands_harness_optimizer/optimizers/system_prompt/contrastive_reflection.py
@@ -135,6 +135,6 @@ def _run_reflection(self, traces_folder: str, params: dict) -> None:
         task_message = self._task_message_template.render(**template_vars)
 
         agent = self._create_agent(system_prompt)
-        agent(task_message)
+        self._invoke_agent(agent, task_message)
 
     # get_state() and load_state() inherited from BaseAgenticOptimizer
diff --git a/strands_harness_optimizer/rollout_engines/agentcore_engine.py b/strands_harness_optimizer/rollout_engines/agentcore_engine.py
@@ -7,7 +7,7 @@
 import json
 import logging
 import uuid
-from typing import Iterator, Optional
+from typing import Callable, Iterator, Optional
 
 import boto3
 from botocore.config import Config as BotocoreConfig
@@ -17,6 +17,8 @@
 from ..utils.parallel_rollout import expand_for_num_rollouts, run_parallel
 from .agent_rollout_engine import AgentRolloutEngine
 
+PayloadMapper = Callable[[dict], dict]
+
 logger = logging.getLogger(__name__)
 
 _DEFAULT_BOTO_CONFIG = {
@@ -108,20 +110,20 @@ class AgentCoreRolloutEngine(AgentRolloutEngine):
     The runtime must already be deployed. Formula parameters are synced
     via ensure_sync_params() and included in each invocation payload.
 
+    The engine builds a canonical ``{"data_sample": ..., "params": ...}``
+    payload per invocation. ``payload_mapper`` is an optional transform
+    applied to that dict before it goes on the wire — use it when the
+    deployed runtime expects a different shape (flat fields, renamed keys,
+    nested envelopes).
+
     Args:
         formula: The Formula being optimized.
         agent_arn: ARN of the deployed AgentCore agent runtime.
         region_name: AWS region where the runtime is deployed.
         boto_config: BotocoreConfig or dict to merge with defaults.
         num_rollouts: Default number of rollouts per data sample.
         num_workers: Number of parallel workers for concurrent invocations.
-
-    Example:
-        engine = AgentCoreRolloutEngine(
-            formula=formula,
-            agent_arn="arn:aws:bedrock-agentcore:us-west-2:123:runtime/abc",
-            num_workers=4,
-        )
+        payload_mapper: Optional callable that takes the canonical payload dict and returns the dict to send.
     """
 
     def __init__(
@@ -132,6 +134,7 @@ def __init__(
         boto_config=None,
         num_rollouts: int = 1,
         num_workers: int = 1,
+        payload_mapper: Optional[PayloadMapper] = None,
     ):
         super().__init__(formula, num_rollouts)
         self.num_workers = num_workers
@@ -141,6 +144,7 @@ def __init__(
             region_name=region_name,
             boto_config=boto_config,
         )
+        self._payload_mapper = payload_mapper
 
         logger.info(f"Initialized AgentCoreRolloutEngine (num_workers={num_workers})")
 
@@ -157,10 +161,9 @@ def generate_batch(self, data_samples: list[dict]) -> Iterator[Rollout]:
 
     def _invoke_runtime(self, data_sample: dict) -> Rollout:
         """Invoke the AgentCore runtime for a single data sample."""
-        payload = {
-            "data_sample": data_sample,
-            "params": self._synced_params,
-        }
+        payload = {"data_sample": data_sample, "params": self._synced_params}
+        if self._payload_mapper is not None:
+            payload = self._payload_mapper(payload)
 
         response_data = self._client.invoke(payload)
 

diff --git a/strands_harness_optimizer/templates/contrastive_reflection_v2/system_prompt.jinja b/strands_harness_optimizer/templates/contrastive_reflection_v2/system_prompt.jinja
@@ -0,0 +1,24 @@
+You are an expert prompt engineer. Your task is to analyze agent execution trajectories and generate an optimized system prompt through contrastive learning.
+
+## Context Window Management
+
+Trajectory files can be large. Before reading ANY file:
+```bash
+ls -lh <file_path>
+```
+- Files < 10KB: Safe to read fully
+- Files > 10KB: Use `head`, `tail`, `grep` with limits
+
+## Your Task
+
+1. Analyze the provided trajectories (successful vs failed)
+2. Extract actionable insights from the differences
+3. **INTEGRATE** these insights into the original system prompt's existing structure
+
+## CRITICAL: Output Format
+
+You must output a **REVISED** system prompt that reads as a single coherent document — not `[original] + appendix`.
+
+- Preserve structural blocks verbatim (tool schemas, output formats, few-shot examples)
+- Rewrite prose sections to weave in insights where they belong
+- Submit the revised prompt via the `submit_optimized_params` tool (see end of this prompt for tool usage)
diff --git a/...ds_harness_optimizer/templates/contrastive_reflection_v2/task_message_system_prompt.jinja b/...ds_harness_optimizer/templates/contrastive_reflection_v2/task_message_system_prompt.jinja
@@ -0,0 +1,161 @@
+Analyze the trajectories and generate an optimized system prompt by **merging insights into the original**, not by appending a separate section.
+
+## Inputs
+
+**Trajectory Folder**: {{ traces_folder }}
+
+**Original System Prompt to Optimize**:
+<original_prompt>
+{{ params.get("system_prompt", "") }}
+</original_prompt>
+
+---
+
+## CRITICAL: Output Requirements
+
+Your output is a **revised system prompt** that integrates learned insights into the original's existing structure. It should read as a single coherent document — not `[original] + "## Learned Behaviors"`.
+
+### 1. Strict Preservation (verbatim — do NOT reword, reorder, or drop)
+
+The following blocks from the original MUST appear in your output character-for-character identical:
+
+- **Tool names** (every tool the original references)
+- **Tool input/output schemas** (JSON schemas, argument descriptions, type signatures)
+- **Output format specifications** (required response structure, required fields, required delimiters, JSON/YAML templates)
+- **Few-shot examples** (input/output pairs, demonstration dialogues)
+- **Any block the original marks as "do not modify", "required format", "template", or similar**
+
+If you are uncertain whether a block is structural vs. prose, treat it as structural and preserve verbatim.
+
+### 2. May Rewrite (integrate insights here)
+
+- Prose instructions and guidance
+- Explanations and rationale
+- Section headings and ordering (as long as structural blocks stay intact)
+- Bullet lists of do's/don'ts — you may add, remove, or edit bullets
+
+When integrating a new insight, place it in the section of the original prompt it relates to (tool usage, error handling, task decomposition, etc.). Only create a new section for topics the original does not cover at all.
+
+### 3. Safety Invariant (non-negotiable)
+
+The revised prompt MUST require the agent to confirm with the user before taking any action with real-world consequences (writes, sends, purchases, external state changes, irreversible operations).
+
+You MUST NOT introduce, and MUST REMOVE if present, any guidance that:
+- Tells the agent to act "immediately", "autonomously", or "without confirmation"
+- Treats user silence or ambiguity as approval
+- Instructs the agent to skip, bypass, or avoid confirmation steps
+- Trades user control for speed, convenience, or higher trace reward
+
+If the original prompt does not already specify a confirmation policy, ADD one. Suggested wording (adapt to the original's voice):
+
+> Before taking any action with real-world consequences, state the planned action in plain language and wait for explicit user approval. Do not treat silence as consent.
+
+If traces appear to reward a behavior that conflicts with this invariant, treat it as a data quality / reward-model issue — do NOT encode it as a learned behavior.
+
+### 4. Length Guardrail
+
+{% set _orig = params.get("system_prompt", "") -%}
+{%- set _orig_len = _orig | length -%}
+{%- set _max_len = [_orig_len + 500, (_orig_len * 1.1) | int] | max %}
+Original prompt length: **{{ _orig_len }} characters**.
+Final prompt length MUST NOT exceed **{{ _max_len }} characters** (max of 1.1x the original length and original + 500). Prefer integrating insights concisely over adding new sections. If you run out of budget, prioritize the highest-impact insights and drop the rest.
+
+---
+
+## Analysis Steps
+
+**Step 1: Check file sizes**
+```bash
+ls -lh {{ traces_folder }}/*.json | head -20
+```
+
+**Step 2: Categorize traces by outcome**
+```bash
+for f in {{ traces_folder }}/*.json; do
+    echo -n "$f: "
+    grep -oE '"reward":\s*[0-9.-]+' "$f" | head -1
+done 2>/dev/null | head -30
+```
+
+**Step 3: Examine sufficient traces**
+
+| Folder Size | Minimum Coverage |
+|-------------|------------------|
+| ≤ 10 traces | Examine ALL |
+| 11-30 traces | At least 70% |
+| > 30 traces | At least 50% |
+
+Balance coverage between successful and failed traces.
+
+**Step 4: Extract insights from differences**
+
+Focus on:
+- What do successful agents do that failed ones don't?
+- What mistakes do failed agents make?
+- What patterns appear across MULTIPLE traces (not one-off issues)?
+
+**Step 5: Place each insight where it belongs**
+
+For each insight:
+1. Identify which section of the original prompt it relates to
+2. Integrate it there — add a bullet, edit existing guidance, or insert a sentence
+3. Only create a new section if the original does not cover the topic
+
+Do NOT collect insights into a single "Learned Behaviors" appendix.
+
+**Step 6: Write the revised prompt to a file and submit**
+
+Write your final revised system prompt to `/tmp/optimized_system_prompt.txt` (plain text, the prompt body only — no YAML wrapper, no markdown fence):
+
+```bash
+cat > /tmp/optimized_system_prompt.txt << 'PROMPT_EOF'
+[your revised prompt here — single coherent document]
+PROMPT_EOF
+```
+
+Then call:
+```
+submit_optimized_params(file_path_dict={"system_prompt": "/tmp/optimized_system_prompt.txt"})
+```
+
+**Step 7: Self-review (structural preservation)**
+
+Re-read `/tmp/optimized_system_prompt.txt` and verify:
+
+- [ ] Every tool name from the original appears in the output
+- [ ] Every tool schema from the original appears character-for-character identical
+- [ ] Every output format specification appears character-for-character identical
+- [ ] Every few-shot example from the original appears character-for-character identical
+- [ ] No block marked "do not modify" / "required format" / "template" has been altered
+
+If any check fails, restore the missing/altered content from the original, rewrite the file, and re-submit via `submit_optimized_params`.
+
+**Step 8: Self-review (safety invariant)**
+
+Re-read the output and verify:
+
+- [ ] The revised prompt requires user confirmation before consequential actions (either preserved from original, or added if the original lacked it)
+- [ ] No line instructs the agent to skip, bypass, or avoid asking for confirmation
+- [ ] No line treats silence or ambiguity as approval
+- [ ] No line instructs the agent to act "immediately" / "autonomously" / "without confirmation" on behalf of the user (unless the original EXPLICITLY authorizes autonomous operation for a specific action)
+
+Remove any offending lines in-place, rewrite the file, and re-submit via `submit_optimized_params`.
+
+---
+
+## Criteria for Good Insights
+
+### Actionability
+Each integrated insight should imply:
+- A clear **trigger condition** (WHEN...)
+- A specific **action** (DO...)
+
+Reject vague insights like "be careful" or "consider edge cases".
+
+### Coverage
+- Address different failure modes observed across traces
+- Prioritize patterns that appear in multiple traces (not one-off issues)
+
+### Appropriate Generalization
+- Keep actions concrete and specific
+- Generalize trigger conditions only when the pattern applies broadly