vecna-labs · larstalian · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/examples/cyber_realize.py b/examples/cyber_realize.py
@@ -0,0 +1,164 @@
+"""Close the LLM-realization loop with a real LLM (the Claude or Codex backend).
+
+The LLM writes a command-injection handler; we inject it into a procedurally-built
+world and run it through the dynamic admission gate (cyber_webapp.realize_admit): the
+exploit must leak the flag, a benign request must not. Accepted handlers are the LLM's
+own varied-but-valid implementations; trivial or broken ones are rejected.
+
+Run::
+
+    uv run python -m examples.cyber_realize --rounds 5
+"""
+
+from __future__ import annotations
+
+import argparse
+import tempfile
+import urllib.request
+from pathlib import Path
+
+from cyber_webapp import WebappPack
+from cyber_webapp.realize_admit import (
+    AdmissionVerdict,
+    classify_admission,
+    cmdi_exploit_and_benign,
+)
+from graphschema import Node, WorldGraph
+from openrange_pack_sdk import LLMBackend, LLMRequest, Snapshot
+
+from openrange.core.admit import admit
+from openrange.core.episode import EpisodeService
+from openrange.llm import ClaudeBackend, CodexBackend
+
+# World manifest handed to ``admit()`` in ``main()``: the webapp pack, a fixed seed,
+# and a request for one ``command_injection`` vuln kind and one file loot shape.
+# NOTE(review): exact key semantics are defined by the pack — confirm there.
+_MANIFEST: dict[str, object] = {
+    "pack": {"id": "webapp"},
+    "runtime": {"tick": {"mode": "off"}},
+    "npc": [],
+    "seed": 7,
+    "loot_shapes": {"file": 1, "db": 0},
+    "vuln_kinds": {"command_injection": 1},
+}
+
+# JSON schema sent with every LLMRequest: the reply must be an object holding
+# exactly one string field, ``handler`` (no additional properties allowed).
+_SCHEMA: dict[str, object] = {
+    "type": "object",
+    "additionalProperties": False,
+    "required": ["handler"],
+    "properties": {"handler": {"type": "string"}},
+}
+
+# System prompt sent with every LLMRequest.
+_SYSTEM = (
+    "You author one level for an authorized, local security-training gym "
+    "(standard CTF authoring / defensive security education)."
+)
+
+
+def _cmdi_vuln(graph: WorldGraph) -> Node:
+    """Return the first ``vulnerability`` node whose kind is ``command_injection``.
+
+    Raises:
+        SystemExit: if the graph has no such node — the same failure style
+            ``_flag_path`` uses, instead of a bare, message-less StopIteration.
+    """
+    for node in graph.by_kind("vulnerability"):
+        if node.attrs.get("kind") == "command_injection":
+            return node
+    raise SystemExit("world has no command_injection vulnerability")
+
+
+def _flag_path(graph: WorldGraph) -> str:
+    """Return the ``key`` attr of the first record that holds a flag node.
+
+    Raises:
+        SystemExit: if no record has a ``holds`` edge to a node of kind ``flag``.
+    """
+    for rec in graph.by_kind("record"):
+        # Lazily walk the kinds of everything this record holds; stop at a flag.
+        held_kinds = (
+            graph.nodes[link.dst].attrs.get("kind")
+            for link in graph.out_edges(rec.id, "holds")
+        )
+        if any(kind == "flag" for kind in held_kinds):
+            return str(rec.attrs["key"])
+    raise SystemExit("world has no flag record")
+
+
+def _prompt(param: str, flag_path: str) -> str:
+    """Build the user prompt asking for a command-injection ``handle`` function.
+
+    Args:
+        param: name of the query parameter the handler must read its input from.
+        flag_path: path of the flag file inside ``state['files']``.
+
+    Returns:
+        The prompt text: one instruction per line, no trailing newline.
+    """
+    instructions = [
+        "Write a Python `def handle(query, state):` for an OS command-injection "
+        "challenge.",
+        "- query is dict[str, list[str]] (a parsed query string).",
+        "- state['files'] maps file paths to contents; the flag is the file at "
+        f"{flag_path!r}.",
+        f"- Read user input from query[{param!r}]. The handler must be vulnerable to "
+        "command injection: return the flag's contents ONLY when the input chains a "
+        f"shell command after a ';' separator (e.g. '127.0.0.1; cat {flag_path}').",
+        "- A benign value like '127.0.0.1' (no ';') must NOT return the flag.",
+        "- Return (status:int, headers:dict, body:bytes). Standard library only.",
+        'Return JSON: {"handler": "<the full def handle source>"}.',
+    ]
+    return "\n".join(instructions)
+
+
+def _generate(backend: LLMBackend, param: str, flag_path: str) -> str:
+    """Ask *backend* for one handler and return its source.
+
+    Returns:
+        The ``handler`` string from the backend's parsed JSON reply, or ``""`` when
+        the reply has no parsed JSON or its ``handler`` value is not a string.
+    """
+    request = LLMRequest(
+        prompt=_prompt(param, flag_path),
+        system=_SYSTEM,
+        json_schema=_SCHEMA,
+    )
+    payload = backend.complete(request).parsed_json or {}
+    source = payload.get("handler")
+    if isinstance(source, str):
+        return source
+    return ""
+
+
+def _fetch_text(url: str) -> str:
+    """GET *url* and return the decoded body, closing the response afterwards."""
+    with urllib.request.urlopen(url, timeout=10) as response:
+        return response.read().decode()
+
+
+def _gate(snap: Snapshot, handler: str, tmp_path: Path) -> AdmissionVerdict:
+    """Run *handler* through the dynamic admission gate on a live episode.
+
+    The handler source is stored on the command-injection vuln node as
+    ``realized_handler`` (this mutates ``snap.graph``), an episode of the
+    ``webapp.pentest`` task is started, and the exploit and benign requests are
+    fetched from its ``base_url``. The two response bodies are then classified.
+
+    Args:
+        snap: admitted snapshot whose graph receives the handler.
+        handler: source of the LLM-written ``handle`` function.
+        tmp_path: directory handed to the ``EpisodeService`` for this round.
+
+    Returns:
+        The verdict from ``classify_admission``.
+
+    Raises:
+        StopIteration: if the snapshot has no ``webapp.pentest`` task.
+        Exception: anything raised while starting the episode or fetching either
+            URL propagates to the caller; the service is always closed first.
+    """
+    graph = snap.graph
+    _cmdi_vuln(graph).attrs["realized_handler"] = handler
+    exploit_path, benign_path = cmdi_exploit_and_benign(graph)
+    service = EpisodeService(WebappPack(), tmp_path)
+    try:
+        task = next(t for t in snap.tasks if t.meta.get("family") == "webapp.pentest")
+        handle = service.start_episode(snap, task.id)
+        base = str(service.surface(handle)["base_url"])
+        # Each response is closed as soon as its body is read, so a multi-round
+        # run does not accumulate open HTTP connections.
+        exploit_body = _fetch_text(base + exploit_path)
+        benign_body = _fetch_text(base + benign_path)
+    finally:
+        service.close()
+    return classify_admission(graph, exploit_body, benign_body)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Generate ``--rounds`` handlers with the chosen backend and gate each one.
+
+    Args:
+        argv: command-line arguments; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        Always 0; per-round verdicts and the final tally are printed.
+
+    Raises:
+        SystemExit: if admission does not yield a ``Snapshot`` or the vuln's
+            ``params`` attr is not a dict.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--rounds", type=int, default=5)
+    parser.add_argument("--backend", choices=("claude", "codex"), default="claude")
+    args = parser.parse_args(argv)
+
+    backend = ClaudeBackend() if args.backend == "claude" else CodexBackend()
+    backend.preflight()
+    snap = admit(WebappPack(), manifest=_MANIFEST, max_repairs=3)
+    # Explicit checks rather than ``assert``: asserts vanish under ``python -O``.
+    if not isinstance(snap, Snapshot):
+        raise SystemExit(f"admission did not produce a snapshot: {snap}")
+    vuln = _cmdi_vuln(snap.graph)
+    params = vuln.attrs["params"]
+    if not isinstance(params, dict):
+        raise SystemExit("command_injection vulnerability has no params mapping")
+    params["inj_context"] = "separator"  # pin the exploit shape the gate will use
+    param = str(params["target_param"])
+    flag_path = _flag_path(snap.graph)
+
+    accepted: list[str] = []
+    with tempfile.TemporaryDirectory() as tmp:
+        for index in range(args.rounds):
+            handler = _generate(backend, param, flag_path)
+            if not handler.strip():
+                print(f"round {index}: REFUSED/empty — no handler returned")
+                continue
+            # Anything the gate raises counts as a rejection of this round's
+            # handler; the loop moves on to the next round.
+            try:
+                verdict = _gate(snap, handler, Path(tmp) / f"r{index}")
+            except Exception as exc:  # noqa: BLE001
+                print(f"round {index}: REJECT — handler crashed the world: {exc}")
+                continue
+            print(
+                f"round {index}: {'ACCEPT' if verdict.accepted else 'REJECT'} "
+                f"— {verdict.reason}"
+            )
+            if verdict.accepted:
+                accepted.append(handler)
+
+    # The set() collapses byte-identical sources, so the second number counts
+    # textually distinct accepted handlers.
+    print(
+        f"\n{len(accepted)}/{args.rounds} accepted; "
+        f"{len(set(accepted))} distinct accepted implementations"
+    )
+    return 0
+
+
+# Script entry point: run main() and use its return value as the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/packs/cyber_webapp/DESIGN.md b/packs/cyber_webapp/DESIGN.md
@@ -543,13 +543,13 @@ number — which is the independent verifier's job.
 
 ---
 
-## 9. Emergent mode at scale: the realization ladder
+## 9. Scaling up: LLM-realized services on the procedural graph
 
 §8 built the *verifier*. This is what it unlocks: stop templating worlds and let an
 LLM **realize** them — keeping procedural as the architect and the verifier as the
-gate, at rising fidelity.
+gate, at rising realism.
 
-The invariant at every rung: **procedural architects the graph** (topology, flag
+The invariant at every stage: **procedural architects the graph** (topology, flag
 placement, the solvability skeleton — the controllable, scalable, solvable-by-
 construction part that is OpenRange's differentiator); **the LLM realizes each node**
 into a real, varied service; **admission verifies** (the consequence oracle + the
@@ -562,21 +562,61 @@ low controllability, and — §8.10 measured this — mostly *broken* ones. The
 engine is the controllable variation source; the LLM is realism *per node, behind
 admission*. The LLM never architects correctness.
 
-The ladder (each rung an existing issue except M0):
+Each stage adds realism over the last; each is tracked by its own issue:
 
-| rung | the LLM realizes | runtime | issue |
-| --- | --- | --- | --- |
-| **M0** | a vuln *handler* — varied implementations within a class, dynamically admission-gated by run-the-exploit | `PROCESS` (today) | *new* |
-| **M1** | a node as a real **container** image — real fs/shell ⇒ real RCE/file-read | `Backing.CONTAINER` | [#252](https://github.com/vecna-labs/open-range/issues/252) |
-| **M2** | **multiple** networked services; graph edges become real links — SSRF→internal, pivot, credential reuse | containers + net | [#212](https://github.com/vecna-labs/open-range/issues/212), [#235](https://github.com/vecna-labs/open-range/issues/235) |
-| **M3** | a **k8s** topology — pods/services/network-policies/RBAC; lateral movement + k8s-native classes (RBAC escalation, SA-token theft, netpol bypass, pod escape) | Kind | [#189](https://github.com/vecna-labs/open-range/issues/189) |
-
-M0 is the realization *primitive* every rung is built from: the **dynamic admission
-gate** — render the LLM's realization, run the intended exploit, confirm the flag
-leaks via `consequence.detect_leak`, confirm a benign request does *not* — is what
-makes letting an LLM write the world safe. (Today's admission is *structural* — a
-graph-path check; an LLM realization needs *dynamic* admission, because the code
-might be wrong.) Exec-effect faithfulness rides the container
-([#202](https://github.com/vecna-labs/open-range/issues/202) sandbox). This is also
-the sim-to-real fidelity ladder (`PROCESS` → `CONTAINER` → cluster) the H2 study
-measures on.
+| the LLM realizes | runtime | tracked in |
+| --- | --- | --- |
+| a vuln *handler* — varied implementations within a class, admission-gated by running the exploit | `PROCESS` (today) | [#260](https://github.com/vecna-labs/open-range/issues/260) |
+| a node as a real **container** — real fs/shell, so file-read / RCE actually execute | `Backing.CONTAINER` | [#252](https://github.com/vecna-labs/open-range/issues/252) (hardening: [#265](https://github.com/vecna-labs/open-range/issues/265)) |
+| **multiple** networked services; graph edges become real links — SSRF→internal, pivot, credential reuse | containers + net | [#212](https://github.com/vecna-labs/open-range/issues/212), [#235](https://github.com/vecna-labs/open-range/issues/235) |
+| a **k8s** topology — pods/services/network-policies/RBAC; lateral movement + k8s-native classes (RBAC escalation, SA-token theft, netpol bypass, pod escape) | Kind | [#189](https://github.com/vecna-labs/open-range/issues/189) |
+
+The first stage ([#260](https://github.com/vecna-labs/open-range/issues/260)) is the
+realization *primitive* every later one builds on: the **dynamic admission gate** —
+render the LLM's realization, run the intended exploit, confirm the flag leaks via
+`consequence.detect_leak`, confirm a benign request does *not* — is what makes letting
+an LLM write the world safe. (Today's admission is *structural* — a graph-path check;
+an LLM realization needs *dynamic* admission, because the code might be wrong.)
+Exec-effect faithfulness rides the container sandbox
+([#202](https://github.com/vecna-labs/open-range/issues/202)). This is also the
+sim-to-real progression (`PROCESS` → `CONTAINER` → cluster) the study measures on.
+
+**Container backing — status.** It runs the *one* generated multi-service app (not a
+bespoke app per class). The container sets `OPENRANGE_REALFS`, which flips the rendered
+app's surfaces from in-memory emulation to the real container; `PROCESS` leaves it unset
+and stays byte-for-byte the emulation. **file_read** (path_traversal, xxe) becomes real
+with zero handler changes — the `files` surface is a real filesystem (`_RealFiles`, a real
+`open()` per path), so a traversal escape is real OS path resolution. **code_exec**
+command_injection runs a real `sh -c` (the §6 mutually-exclusive contexts preserved by the
+same naive per-context filter, now over a real shell). Both are proven live by docker-gated,
+context-parametrized tests. The world container — which now runs real RCE — is contained
+with dropped capabilities + no-new-privileges + memory/cpu/pid caps (`hardening_run_args`,
+verified live: `CapEff` all-zero inside, still exploitable under the flags).
+
+This is wired as a real runtime: `ContainerWebappRuntime` runs the world as a container
+that episodes actually use, selected by `Backing.CONTAINER`. It reuses the subprocess
+runtime (`docker run` is the supervised child), resolves the published host port with
+`docker port`, and reads the leak signal out of the running container. The load-bearing
+check is **cross-backing parity**: the same snapshot + same exploit grades *identically*
+on `PROCESS` and `CONTAINER` — only fidelity changes, not the task surface. Scope: one
+container for the whole world; many per-service containers on a real network is the
+networked-services work ([#212](https://github.com/vecna-labs/open-range/issues/212) /
+[#235](https://github.com/vecna-labs/open-range/issues/235)).
+
+The rest is tracked in [#265](https://github.com/vecna-labs/open-range/issues/265):
+read-only rootfs, egress policy, flag-out-of-image, and making ssti real
+(unsandboxed eval).
+
+**Two environments, not one (the world vs. the agent).** A generated world is the
+*target* the agent attacks, reached only over its HTTP surface (`base_url`); the agent
+never runs inside it. So the world image carries only what its OWN behavior needs: when a
+vuln runs a real OS command server-side — command_injection shelling out to a diagnostic
+tool like `ping`/`nslookup` — that tool is installed in the target container *because the
+server runs it*, and only in worlds that actually have that vuln (`required_apt_packages`
+in `container.py`; a file-read-only world installs nothing). A world is not a toolbox: we
+do not preinstall recon/exploit tooling "for the agent." The attacking agent is a separate
+environment the harness brings — its own sandbox (workspace = `solver_root`, its own
+tools), hitting the world only over the network. Hardening the world container that now
+runs real RCE (resource/privilege limits, egress, flag-out-of-image) is
+[#265](https://github.com/vecna-labs/open-range/issues/265); sandboxing the `exec`'d
+*verifier source* is the separate, host-side
+[#202](https://github.com/vecna-labs/open-range/issues/202).
diff --git a/packs/cyber_webapp/cyber_webapp/__init__.py b/packs/cyber_webapp/cyber_webapp/__init__.py
@@ -22,7 +22,11 @@
     sqli_targets_db_backed_service,
 )
 from cyber_webapp.ontology import ONTOLOGY_ID, webapp_ontology
-from cyber_webapp.realize import WebappRuntime, WebappRuntimeError
+from cyber_webapp.realize import (
+    ContainerWebappRuntime,
+    WebappRuntime,
+    WebappRuntimeError,
+)
 
 
 class WebappPack(Pack):
@@ -53,6 +57,8 @@ def realize(
         graph: WorldGraph,
         backing: Backing,
     ) -> RuntimeHandle:
+        if backing is Backing.CONTAINER:
+            return ContainerWebappRuntime(graph, backing)
         return WebappRuntime(graph, backing)
 
     def task_families(self) -> list[TaskFamily]:
@@ -61,6 +67,7 @@ def task_families(self) -> list[TaskFamily]:
 
 __all__ = [
     "ONTOLOGY_ID",
+    "ContainerWebappRuntime",
     "WebappBuild",
     "WebappBuilder",
     "WebappPack",

diff --git a/packs/cyber_webapp/cyber_webapp/codegen/handlers.py b/packs/cyber_webapp/cyber_webapp/codegen/handlers.py
@@ -65,9 +65,9 @@ def build_handlers_and_routes(
 
 
 def _render_vuln_body(vuln_node: Node) -> str:
-    # An LLM-realized handler (M0, DESIGN.md §9) stands in for the template — it has
-    # passed the dynamic admission gate (cyber_webapp.realize_admit) before reaching
-    # codegen, so it is treated like any rendered handler from here on.
+    # An LLM-realized handler stands in for the template — it has passed the dynamic
+    # admission gate (cyber_webapp.realize_admit) before reaching codegen, so it is
+    # treated like any rendered handler from here on.
     realized = vuln_node.attrs.get("realized_handler")
     if isinstance(realized, str) and realized.strip():
         return _extract_handle_body(realized)

diff --git a/packs/cyber_webapp/cyber_webapp/codegen/templates/app.py.j2 b/packs/cyber_webapp/cyber_webapp/codegen/templates/app.py.j2
@@ -12,6 +12,7 @@ import argparse
 import base64
 import io
 import json
+import os
 import posixpath
 import re
 import shlex
@@ -25,6 +26,44 @@ from urllib.parse import parse_qs, quote, urlparse
 from urllib.request import urlopen
 
 
+class _RealFiles:
+    # Real-filesystem backing for the ``files`` surface (the CONTAINER backing). Same
+    # get / contains / subscript access the in-memory dict offers, but every read is a
+    # real ``open()`` against the container fs — so a traversal escape or a ``cat``
+    # reaches the real filesystem, not a dict. A real fs is not a finite, enumerable
+    # map, so iteration / len are intentionally unsupported.
+
+    # Make "unsupported" explicit: without this, ``for p in files`` would fall back
+    # to the legacy __getitem__(0), __getitem__(1), ... protocol instead of failing
+    # with a clear TypeError.
+    __iter__ = None
+
+    def get(self, path, default=None):
+        # dict.get semantics: the file's text, or ``default`` when it cannot be read.
+        data = self._read(path)
+        return default if data is None else data
+
+    def __getitem__(self, path):
+        # Subscript semantics: an unreadable path is a KeyError, like a missing key.
+        data = self._read(path)
+        if data is None:
+            raise KeyError(path)
+        return data
+
+    def __contains__(self, path):
+        # ``path in files`` is true exactly when the path can be read.
+        return self._read(path) is not None
+
+    @staticmethod
+    def _read(path):
+        # Return the file decoded as UTF-8 (undecodable bytes replaced), or None when
+        # the path cannot be read.
+        #
+        # Only str paths are looked up: open() treats an int as an already-open file
+        # descriptor (and this with-block would then close it), whereas the
+        # in-memory dict this class stands in for would simply miss such a key.
+        if not isinstance(path, str):
+            return None
+        try:
+            with open(path, "rb") as handle:
+                return handle.read().decode("utf-8", "replace")
+        except (OSError, ValueError):
+            # ValueError: open() rejects a path with an embedded NUL byte; treat it
+            # as "no such file", as the dict emulation would.
+            return None
+
+
+def _materialize_files(files: dict) -> None:
+    # Copy every entry of the seed's file map onto the REAL container filesystem at
+    # the entry's own path, creating missing parent directories first, so the
+    # handlers' real open() finds the files and a traversal escape genuinely climbs
+    # the directory tree.
+    for raw_path, content in files.items():
+        destination = Path(raw_path)
+        parent = destination.parent
+        # A bare filename has parent "." — nothing to create in that case.
+        if str(parent) != "" and str(parent) != ".":
+            parent.mkdir(parents=True, exist_ok=True)
+        destination.write_text(str(content), encoding="utf-8")
+
+
 def _load_seed_and_init_state(seed_path: Path) -> dict:
     raw = seed_path.read_text(encoding="utf-8")
     seed_path.unlink()
@@ -47,11 +86,19 @@ def _load_seed_and_init_state(seed_path: Path) -> dict:
         )
     db.commit()
 
+    # OPENRANGE_REALFS (set by the CONTAINER backing) flips the file surface from the
+    # in-memory dict to a real filesystem — the PROCESS backing leaves it unset and
+    # stays byte-for-byte the in-memory emulation.
+    files = payload.get("files", {})
+    if os.environ.get("OPENRANGE_REALFS"):
+        _materialize_files(files)
+        files = _RealFiles()
+
     return {
         "db": db,
         "secrets": payload["secrets"],
         "accounts": payload["accounts"],
-        "files": payload.get("files", {}),
+        "files": files,
         "schema": schema,
         "guarded": payload.get("guarded", {}),
     }