Draft
Changes from all commits (92 commits)
9d8f149
kind of works
Nov 9, 2025
95c21c8
trace scoring
Nov 9, 2025
4036939
better api
Nov 10, 2025
d02704a
more changes
Nov 11, 2025
942b49c
better api
Nov 11, 2025
64e1070
Merge branch 'main' into alex/trace-in-scorer
Nov 20, 2025
e734a6e
make trace context flush before fetching
Nov 21, 2025
355cf55
Merge branch 'main' into alex/trace-in-scorer
Dec 1, 2025
fda9d29
rename
Dec 1, 2025
fcd8f91
jsdoc
Dec 1, 2025
f0f93af
Merge branch 'main' into alex/trace-in-scorer
Dec 15, 2025
b13ee06
cache v1
Dec 16, 2025
c0035f6
tmp file
Dec 16, 2025
77b26a9
flag
Dec 16, 2025
59ee9b8
bump vers to fix test
Dec 16, 2025
4c1bc79
major bump
Dec 16, 2025
2eeeb46
disable local cache
Dec 16, 2025
30e5217
otel support?
Dec 16, 2025
f67f9e8
turn off cache when otel is used
Dec 16, 2025
7807091
remove console.log
Dec 17, 2025
bc236ef
sensible new version
Dec 17, 2025
9e864e2
Merge branch 'main' into alex/trace-in-scorer
Dec 17, 2025
efea2cb
fix build
Dec 17, 2025
b5d117b
try to fix web builds
Dec 17, 2025
a34741d
don't pass trace to scoring args
Dec 17, 2025
59f6c7b
Merge branch 'main' into alex/trace-in-scorer
Dec 19, 2025
b1dc350
pass state into the trace object
Dec 26, 2025
f67aba5
get passed in state
Dec 26, 2025
dcc7dd7
make cache writes not block
Dec 26, 2025
7cf3a5a
remove trace re-export
Dec 26, 2025
0300f07
fix test
Dec 26, 2025
469fa20
extend object fetcher
Dec 26, 2025
5295cd4
refactors for objects
Dec 26, 2025
b5b3511
state argument for init function
Dec 29, 2025
c314138
forgot doc
Dec 29, 2025
8eb6d49
Merge branch 'main' into alex/trace-in-scorer
CLowbrow Dec 29, 2025
3ff1c5d
evaluator doesn't always have state
Dec 29, 2025
b252963
console
Dec 29, 2025
cd7d04e
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Dec 30, 2025
3b4653a
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Dec 31, 2025
56c5bf9
export trace
ankrgyl Dec 31, 2025
38dcd00
convert Trace to be an interface
ankrgyl Jan 1, 2026
087e2a2
export more stuff
ankrgyl Jan 1, 2026
ddccd08
fix
ankrgyl Jan 2, 2026
5ca164f
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 3, 2026
4c789a0
use span_attributes.purpsoe
ankrgyl Jan 3, 2026
ca8b861
fix
ankrgyl Jan 3, 2026
0fc8400
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 9, 2026
c5e43fc
only turn cache on for evals
Jan 10, 2026
863b5e6
get rid of syncwrite
Jan 12, 2026
927df14
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 12, 2026
7992b74
factor out cached span fetcher
ankrgyl Jan 13, 2026
e721910
spancache cleanup
Jan 13, 2026
98ae957
init cache in inner eval method
Jan 13, 2026
db9ad2a
handle parallell evals
Jan 13, 2026
763dab6
belt and suspenders
Jan 13, 2026
3fca8aa
remove logs
Jan 13, 2026
6324d80
use mergeDict
Jan 13, 2026
53e6380
cleanup
Jan 13, 2026
ebf9524
don't crash
Jan 14, 2026
91dcaa1
PYTHON
Jan 14, 2026
bc1f473
fix bundler test
cpinn Jan 14, 2026
f7a8beb
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 15, 2026
9cec760
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 15, 2026
d2226f7
ag fixes
ankrgyl Jan 14, 2026
d00c30c
fix invoke
ankrgyl Jan 14, 2026
a53b21a
snapshot
ankrgyl Jan 15, 2026
d0c375c
Revert "snapshot"
ankrgyl Jan 15, 2026
9eb327e
Revert "fix invoke"
ankrgyl Jan 15, 2026
c27c538
fix json serializability
ankrgyl Jan 15, 2026
25322d2
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 15, 2026
62e8955
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 15, 2026
c53ff50
rename config fields
Jan 15, 2026
e83fcb1
Merge branch 'main' into alex/trace-in-scorer
Jan 15, 2026
e43c61e
Merge branch 'caitlin/fix-bundler-tes' into alex/trace-in-scorer
Jan 15, 2026
f2ae38f
Merge branch 'main' into alex/trace-in-scorer
Jan 15, 2026
c505f2a
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 16, 2026
26e0658
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 16, 2026
61ba617
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 16, 2026
28cfbe8
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 16, 2026
1314573
wip otel stuff
Jan 16, 2026
67abadc
add a toJSON method
ankrgyl Jan 16, 2026
a6d1dd4
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 16, 2026
499e6fc
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 16, 2026
c630564
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 16, 2026
d74224f
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 16, 2026
5c18fe4
init-dataset-with-id (#1276)
ankrgyl Jan 16, 2026
e71bf48
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 17, 2026
b99f266
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 17, 2026
4245dae
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 17, 2026
c62769e
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 17, 2026
4a935ca
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 20, 2026
60 changes: 56 additions & 4 deletions py/examples/evals/eval_example.py
@@ -1,12 +1,64 @@
import json

from braintrust import Eval

NUM_EXAMPLES = 10


def exact_match_scorer(input, output, expected):
if expected is None:
return 0.0
return 1.0 if output == expected else 0.0
async def exact_match_scorer(input, output, expected, trace=None):
"""Async scorer that prints trace spans."""
score = 0.0
if expected is not None:
score = 1.0 if output == expected else 0.0

if trace:
print("\n" + "="*80)
print(f"🔍 TRACE INFO for input: {input}")
print("="*80)

# Print trace configuration
config = trace.get_configuration()
print(f"\n📋 Configuration:")
print(f" Object Type: {config.get('objectType')}")
print(f" Object ID: {config.get('objectId')}")
print(f" Root Span: {config.get('rootSpanId')}")

# Fetch and print spans
try:
spans = await trace.get_spans()
print(f"\n✨ Found {len(spans)} spans:")
print("-"*80)

for i, span in enumerate(spans, 1):
print(f"\n Span {i}:")
print(f" ID: {span.span_id}")
span_type = span.span_attributes.get('type', 'N/A') if span.span_attributes else 'N/A'
span_name = span.span_attributes.get('name', 'N/A') if span.span_attributes else 'N/A'
print(f" Type: {span_type}")
print(f" Name: {span_name}")

if span.input:
input_str = json.dumps(span.input)
if len(input_str) > 100:
input_str = input_str[:100] + "..."
print(f" Input: {input_str}")
if span.output:
output_str = json.dumps(span.output)
if len(output_str) > 100:
output_str = output_str[:100] + "..."
print(f" Output: {output_str}")
if span.metadata:
print(f" Metadata: {list(span.metadata.keys())}")

print("\n" + "="*80 + "\n")
except Exception as e:
print(f"\n⚠️ Error fetching spans: {e}")
import traceback
traceback.print_exc()
else:
print(f"⚠️ No trace available for input: {input}")

return score


def data_fn():
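For orientation, a minimal usage sketch (not part of the diff) of how the trace-aware scorer above plugs into Eval. It assumes the example's data_fn, a trivial echo task, and a placeholder project name; the framework supplies the trace keyword argument to the scorer when spans are available.

```python
# Hypothetical wiring for the example above; data_fn and exact_match_scorer
# are defined in eval_example.py, and the project name is a placeholder.
from braintrust import Eval

Eval(
    "trace-in-scorer-example",      # placeholder project name
    data=data_fn,                   # yields {"input": ..., "expected": ...} records
    task=lambda input: input,       # trivial echo task, so output == expected
    scores=[exact_match_scorer],    # receives trace=... from the framework at score time
)
```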
79 changes: 78 additions & 1 deletion py/src/braintrust/framework.py
@@ -1280,6 +1280,29 @@ async def _run_evaluator_internal(
filters: list[Filter],
stream: Callable[[SSEProgressEvent], None] | None = None,
state: BraintrustState | None = None,
):
# Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
if state is None:
from braintrust.logger import _internal_get_global_state

state = _internal_get_global_state()

state.span_cache.start()
try:
return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
finally:
# Clean up disk-based span cache after eval completes and stop caching
state.span_cache.dispose()
state.span_cache.stop()


async def _run_evaluator_internal_impl(
experiment,
evaluator: Evaluator,
position: int | None,
filters: list[Filter],
stream: Callable[[SSEProgressEvent], None] | None = None,
state: BraintrustState | None = None,
):
event_loop = asyncio.get_event_loop()

@@ -1290,11 +1313,13 @@ async def await_or_run_scorer(root_span, scorer, name, **kwargs):
{**parent_propagated},
{"span_attributes": {"purpose": "scorer"}},
)
# Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
with root_span.start_span(
name=name,
span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
propagated_event=merged_propagated,
input=dict(**kwargs),
input=logged_input,
) as span:
score = scorer
if hasattr(scorer, "eval_async"):
@@ -1415,6 +1440,57 @@ def report_progress(event: TaskProgressEvent):
tags = hooks.tags if hooks.tags else None
root_span.log(output=output, metadata=metadata, tags=tags)

# Create trace object for scorers
from braintrust.trace import LocalTrace

async def ensure_spans_flushed():
# Flush native Braintrust spans
if experiment:
from braintrust.logger import flush as flush_logger

await asyncio.get_event_loop().run_in_executor(
None, lambda: flush_logger(state=experiment._state)
)
elif state:
from braintrust.logger import flush as flush_logger

await asyncio.get_event_loop().run_in_executor(None, lambda: flush_logger(state=state))
else:
from braintrust.logger import flush as flush_logger

await asyncio.get_event_loop().run_in_executor(None, flush_logger)

experiment_id = None
if experiment:
try:
experiment_id = experiment.id
except Exception:
experiment_id = None

trace = None
if state or experiment:
# Get the state to use
trace_state = state
if not trace_state and experiment:
trace_state = experiment._state
if not trace_state:
# Fall back to global state
from braintrust.logger import _internal_get_global_state

trace_state = _internal_get_global_state()

# Access root_span_id from the concrete SpanImpl instance
# The Span interface doesn't expose this but SpanImpl has it
root_span_id_value = getattr(root_span, "root_span_id", root_span.id)

trace = LocalTrace(
object_type="experiment",
object_id=experiment_id or "",
root_span_id=root_span_id_value,
ensure_spans_flushed=ensure_spans_flushed,
state=trace_state,
)

score_promises = [
asyncio.create_task(
await_or_run_scorer(
Expand All @@ -1426,6 +1502,7 @@ def report_progress(event: TaskProgressEvent):
"expected": datum.expected,
"metadata": metadata,
"output": output,
"trace": trace,
},
)
)
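To summarize what scorers receive from this change, here is a hedged sketch of the trace surface implied by the LocalTrace(...) construction above and by the example scorer's calls (get_configuration and get_spans). It is a typing Protocol for illustration only, not the actual LocalTrace definition; anything not shown in the diff is an assumption.

```python
# Illustrative only: the scorer-facing trace interface inferred from this PR.
from typing import Any, Protocol


class TraceLike(Protocol):
    def get_configuration(self) -> dict[str, Any]:
        """Return {"objectType", "objectId", "rootSpanId"} for the current eval row."""
        ...

    async def get_spans(self) -> list[Any]:
        """Ensure pending spans are flushed, then fetch the spans under the root span."""
        ...
```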
4 changes: 3 additions & 1 deletion py/src/braintrust/functions/invoke.py
@@ -3,7 +3,7 @@
from sseclient import SSEClient

from .._generated_types import FunctionTypeEnum
from ..logger import Exportable, get_span_parent_object, login, proxy_conn
from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
from ..util import response_raise_for_status
from .constants import INVOKE_API_VERSION
from .stream import BraintrustInvokeError, BraintrustStream
@@ -243,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
:param version: Optional version of the function to use. Defaults to latest.
:return: A function that can be used as a task or scorer.
"""
# Disable span cache since remote function spans won't be in the local cache
_internal_get_global_state().span_cache.disable()

def f(*args: Any, **kwargs: Any) -> Any:
if len(args) > 0:
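A hedged sketch of how the remote-function path interacts with the cache change above: calling init_function disables the local span cache, so a scorer's trace queries fall back to the server, where the remote function's spans actually live. Project and slug names below are placeholders.

```python
from braintrust import Eval
from braintrust.functions.invoke import init_function

llm_judge = init_function("my-project", "llm-judge")  # version defaults to latest

Eval(
    "remote-scorer-example",                           # placeholder project name
    data=lambda: [{"input": "hi", "expected": "hi"}],
    task=lambda input: input,
    scores=[llm_judge],   # remote scorer; the span cache is now disabled
)
```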
61 changes: 61 additions & 0 deletions py/src/braintrust/functions/test_invoke.py
@@ -0,0 +1,61 @@
"""Tests for the invoke module, particularly init_function."""


from braintrust.functions.invoke import init_function
from braintrust.logger import _internal_get_global_state, _internal_reset_global_state


class TestInitFunction:
"""Tests for init_function."""

def setup_method(self):
"""Reset state before each test."""
_internal_reset_global_state()

def teardown_method(self):
"""Clean up after each test."""
_internal_reset_global_state()

def test_init_function_disables_span_cache(self):
"""Test that init_function disables the span cache."""
state = _internal_get_global_state()

# Cache should be disabled by default (it's only enabled during evals)
assert state.span_cache.disabled is True

# Enable the cache (simulating what happens during eval)
state.span_cache.start()
assert state.span_cache.disabled is False

# Call init_function
f = init_function("test-project", "test-function")

# Cache should now be disabled (init_function explicitly disables it)
assert state.span_cache.disabled is True
assert f.__name__ == "init_function-test-project-test-function-latest"

def test_init_function_with_version(self):
"""Test that init_function creates a function with the correct name including version."""
f = init_function("my-project", "my-scorer", version="v1")
assert f.__name__ == "init_function-my-project-my-scorer-v1"

def test_init_function_without_version_uses_latest(self):
"""Test that init_function uses 'latest' in name when version not specified."""
f = init_function("my-project", "my-scorer")
assert f.__name__ == "init_function-my-project-my-scorer-latest"

def test_init_function_permanently_disables_cache(self):
"""Test that init_function permanently disables the cache (can't be re-enabled)."""
state = _internal_get_global_state()

# Enable the cache
state.span_cache.start()
assert state.span_cache.disabled is False

# Call init_function
init_function("test-project", "test-function")
assert state.span_cache.disabled is True

# Try to start again - should still be disabled because of explicit disable
state.span_cache.start()
assert state.span_cache.disabled is True
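For reference, a hedged sketch of the SpanCache lifecycle these tests and the framework/logger changes imply. Only calls that appear in this PR are used; the ordering notes are inferred rather than documented behavior.

```python
from braintrust.logger import _internal_get_global_state

state = _internal_get_global_state()

# Disabled by default; _run_evaluator_internal enables it for the duration of an eval.
state.span_cache.start()
assert state.span_cache.disabled is False

# While enabled, log_internal queues experiment spans into the cache:
#   state.span_cache.queue_write(root_span_id, span_id, cached_span)

# After the eval, the framework drops the on-disk cache and stops caching.
state.span_cache.dispose()
state.span_cache.stop()

# disable() is a permanent opt-out (used by init_function and the OTEL hook);
# start() cannot re-enable the cache afterwards.
state.span_cache.disable()
state.span_cache.start()
assert state.span_cache.disabled is True
```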
54 changes: 54 additions & 0 deletions py/src/braintrust/logger.py
@@ -401,6 +401,11 @@ def default_get_api_conn():
),
)

from braintrust.span_cache import SpanCache

self.span_cache = SpanCache()
self._otel_flush_callback: Any | None = None

def reset_login_info(self):
self.app_url: str | None = None
self.app_public_url: str | None = None
@@ -457,6 +462,21 @@ def context_manager(self):

return self._context_manager

def register_otel_flush(self, callback: Any) -> None:
"""
Register an OTEL flush callback. This is called by the OTEL integration
when it initializes a span processor/exporter.
"""
self._otel_flush_callback = callback

async def flush_otel(self) -> None:
"""
Flush OTEL spans if a callback is registered.
Called during ensure_spans_flushed to ensure OTEL spans are visible in BTQL.
"""
if self._otel_flush_callback:
await self._otel_flush_callback()

def copy_state(self, other: "BraintrustState"):
"""Copy login information from another BraintrustState instance."""
self.__dict__.update({
@@ -1777,6 +1797,25 @@ def login(
_state.login(app_url=app_url, api_key=api_key, org_name=org_name, force_login=force_login)


def register_otel_flush(callback: Any) -> None:
"""
Register a callback to flush OTEL spans. This is called by the OTEL integration
when it initializes a span processor/exporter.

When ensure_spans_flushed is called (e.g., before a BTQL query in scorers),
this callback will be invoked to ensure OTEL spans are flushed to the server.

Also disables the span cache, since OTEL spans aren't in the local cache
and we need BTQL to see the complete span tree (both native + OTEL spans).

:param callback: The async callback function to flush OTEL spans.
"""
global _state
_state.register_otel_flush(callback)
# Disable span cache since OTEL spans aren't in the local cache
_state.span_cache.disable()


def login_to_state(
app_url: str | None = None,
api_key: str | None = None,
@@ -3847,6 +3886,21 @@ def log_internal(self, event: dict[str, Any] | None = None, internal_data: dict[
if serializable_partial_record.get("metrics", {}).get("end") is not None:
self._logged_end_time = serializable_partial_record["metrics"]["end"]

# Write to local span cache for scorer access
# Only cache experiment spans - regular logs don't need caching
if self.parent_object_type == SpanObjectTypeV3.EXPERIMENT:
from braintrust.span_cache import CachedSpan

cached_span = CachedSpan(
span_id=self.span_id,
input=serializable_partial_record.get("input"),
output=serializable_partial_record.get("output"),
metadata=serializable_partial_record.get("metadata"),
span_parents=self.span_parents,
span_attributes=serializable_partial_record.get("span_attributes"),
)
self.state.span_cache.queue_write(self.root_span_id, self.span_id, cached_span)

def compute_record() -> dict[str, Any]:
exporter = _get_exporter()
return dict(
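Finally, a hedged sketch of how an OTEL integration might use the new register_otel_flush hook. The hook itself comes from this diff; the tracer-provider wiring and force_flush call follow the OpenTelemetry SDK and are assumptions here, not part of the PR.

```python
import asyncio

from braintrust.logger import register_otel_flush


def wire_up_otel(tracer_provider) -> None:
    """Register an async flush callback for an OpenTelemetry TracerProvider (assumed setup)."""

    async def flush_otel_spans() -> None:
        # force_flush() blocks in the OTel SDK, so run it off the event loop.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, tracer_provider.force_flush)

    # Registering the callback also disables the local span cache, so scorer
    # BTQL queries go to the server where native and OTEL spans are both visible.
    register_otel_flush(flush_otel_spans)
```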