Draft
Changes from all commits (92 commits)
9d8f149
kind of works
Nov 9, 2025
95c21c8
trace scoring
Nov 9, 2025
4036939
better api
Nov 10, 2025
d02704a
more changes
Nov 11, 2025
942b49c
better api
Nov 11, 2025
64e1070
Merge branch 'main' into alex/trace-in-scorer
Nov 20, 2025
e734a6e
make trace context flush before fetching
Nov 21, 2025
355cf55
Merge branch 'main' into alex/trace-in-scorer
Dec 1, 2025
fda9d29
rename
Dec 1, 2025
fcd8f91
jsdoc
Dec 1, 2025
f0f93af
Merge branch 'main' into alex/trace-in-scorer
Dec 15, 2025
b13ee06
cache v1
Dec 16, 2025
c0035f6
tmp file
Dec 16, 2025
77b26a9
flag
Dec 16, 2025
59ee9b8
bump vers to fix test
Dec 16, 2025
4c1bc79
major bump
Dec 16, 2025
2eeeb46
disable local cache
Dec 16, 2025
30e5217
otel support?
Dec 16, 2025
f67f9e8
turn off cache when otel is used
Dec 16, 2025
7807091
remove console.log
Dec 17, 2025
bc236ef
sensible new version
Dec 17, 2025
9e864e2
Merge branch 'main' into alex/trace-in-scorer
Dec 17, 2025
efea2cb
fix build
Dec 17, 2025
b5d117b
try to fix web builds
Dec 17, 2025
a34741d
don't pass trace to scoring args
Dec 17, 2025
59f6c7b
Merge branch 'main' into alex/trace-in-scorer
Dec 19, 2025
b1dc350
pass state into the trace object
Dec 26, 2025
f67aba5
get passed in state
Dec 26, 2025
dcc7dd7
make cache writes not block
Dec 26, 2025
7cf3a5a
remove trace re-export
Dec 26, 2025
0300f07
fix test
Dec 26, 2025
469fa20
extend object fetcher
Dec 26, 2025
5295cd4
refactors for objects
Dec 26, 2025
b5b3511
state argument for init function
Dec 29, 2025
c314138
forgot doc
Dec 29, 2025
8eb6d49
Merge branch 'main' into alex/trace-in-scorer
CLowbrow Dec 29, 2025
3ff1c5d
evaluator doesn't always have state
Dec 29, 2025
b252963
console
Dec 29, 2025
cd7d04e
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Dec 30, 2025
3b4653a
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Dec 31, 2025
56c5bf9
export trace
ankrgyl Dec 31, 2025
38dcd00
convert Trace to be an interface
ankrgyl Jan 1, 2026
087e2a2
export more stuff
ankrgyl Jan 1, 2026
ddccd08
fix
ankrgyl Jan 2, 2026
5ca164f
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 3, 2026
4c789a0
use span_attributes.purpsoe
ankrgyl Jan 3, 2026
ca8b861
fix
ankrgyl Jan 3, 2026
0fc8400
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 9, 2026
c5e43fc
only turn cache on for evals
Jan 10, 2026
863b5e6
get rid of syncwrite
Jan 12, 2026
927df14
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 12, 2026
7992b74
factor out cached span fetcher
ankrgyl Jan 13, 2026
e721910
spancache cleanup
Jan 13, 2026
98ae957
init cache in inner eval method
Jan 13, 2026
db9ad2a
handle parallell evals
Jan 13, 2026
763dab6
belt and suspenders
Jan 13, 2026
3fca8aa
remove logs
Jan 13, 2026
6324d80
use mergeDict
Jan 13, 2026
53e6380
cleanup
Jan 13, 2026
ebf9524
don't crash
Jan 14, 2026
91dcaa1
PYTHON
Jan 14, 2026
bc1f473
fix bundler test
cpinn Jan 14, 2026
f7a8beb
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 15, 2026
9cec760
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 15, 2026
d2226f7
ag fixes
ankrgyl Jan 14, 2026
d00c30c
fix invoke
ankrgyl Jan 14, 2026
a53b21a
snapshot
ankrgyl Jan 15, 2026
d0c375c
Revert "snapshot"
ankrgyl Jan 15, 2026
9eb327e
Revert "fix invoke"
ankrgyl Jan 15, 2026
c27c538
fix json serializability
ankrgyl Jan 15, 2026
25322d2
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 15, 2026
62e8955
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 15, 2026
c53ff50
rename config fields
Jan 15, 2026
e83fcb1
Merge branch 'main' into alex/trace-in-scorer
Jan 15, 2026
e43c61e
Merge branch 'caitlin/fix-bundler-tes' into alex/trace-in-scorer
Jan 15, 2026
f2ae38f
Merge branch 'main' into alex/trace-in-scorer
Jan 15, 2026
c505f2a
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 16, 2026
26e0658
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 16, 2026
61ba617
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 16, 2026
28cfbe8
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 16, 2026
1314573
wip otel stuff
Jan 16, 2026
67abadc
add a toJSON method
ankrgyl Jan 16, 2026
a6d1dd4
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 16, 2026
499e6fc
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 16, 2026
c630564
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 16, 2026
d74224f
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 16, 2026
5c18fe4
init-dataset-with-id (#1276)
ankrgyl Jan 16, 2026
e71bf48
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 17, 2026
b99f266
Merge branch 'main' into alex/trace-in-scorer
ankrgyl Jan 17, 2026
4245dae
Merge branch 'alex/trace-in-scorer' into both-trace-scorers
ankrgyl Jan 17, 2026
c62769e
Merge branch 'alex/trace-in-scorer-python' into both-trace-scorers
ankrgyl Jan 17, 2026
4a935ca
Merge branch 'main' into alex/trace-in-scorer-python
ankrgyl Jan 20, 2026
60 changes: 56 additions & 4 deletions py/examples/evals/eval_example.py
@@ -1,12 +1,64 @@
import json

from braintrust import Eval

NUM_EXAMPLES = 10


def exact_match_scorer(input, output, expected):
if expected is None:
return 0.0
return 1.0 if output == expected else 0.0
async def exact_match_scorer(input, output, expected, trace=None):
"""Async scorer that prints trace spans."""
score = 0.0
if expected is not None:
score = 1.0 if output == expected else 0.0

if trace:
print("\n" + "="*80)
print(f"🔍 TRACE INFO for input: {input}")
print("="*80)

# Print trace configuration
config = trace.get_configuration()
print(f"\n📋 Configuration:")
print(f" Object Type: {config.get('objectType')}")
print(f" Object ID: {config.get('objectId')}")
print(f" Root Span: {config.get('rootSpanId')}")

# Fetch and print spans
try:
spans = await trace.get_spans()
print(f"\n✨ Found {len(spans)} spans:")
print("-"*80)

for i, span in enumerate(spans, 1):
print(f"\n Span {i}:")
print(f" ID: {span.span_id}")
span_type = span.span_attributes.get('type', 'N/A') if span.span_attributes else 'N/A'
span_name = span.span_attributes.get('name', 'N/A') if span.span_attributes else 'N/A'
print(f" Type: {span_type}")
print(f" Name: {span_name}")

if span.input:
input_str = json.dumps(span.input)
if len(input_str) > 100:
input_str = input_str[:100] + "..."
print(f" Input: {input_str}")
if span.output:
output_str = json.dumps(span.output)
if len(output_str) > 100:
output_str = output_str[:100] + "..."
print(f" Output: {output_str}")
if span.metadata:
print(f" Metadata: {list(span.metadata.keys())}")

print("\n" + "="*80 + "\n")
except Exception as e:
print(f"\n⚠️ Error fetching spans: {e}")
import traceback
traceback.print_exc()
else:
print(f"⚠️ No trace available for input: {input}")

return score


def data_fn():
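For orientation, a minimal usage sketch (not part of the diff) of how the trace-aware scorer above plugs into Eval. It assumes the example's data_fn, a trivial echo task, and a placeholder project name; the framework supplies the trace keyword argument to the scorer when spans are available.

```python
# Hypothetical wiring for the example above; data_fn and exact_match_scorer
# are defined in eval_example.py, and the project name is a placeholder.
from braintrust import Eval

Eval(
    "trace-in-scorer-example",      # placeholder project name
    data=data_fn,                   # yields {"input": ..., "expected": ...} records
    task=lambda input: input,       # trivial echo task, so output == expected
    scores=[exact_match_scorer],    # receives trace=... from the framework at score time
)
```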
79 changes: 78 additions & 1 deletion py/src/braintrust/framework.py
@@ -1280,6 +1280,29 @@ async def _run_evaluator_internal(
filters: list[Filter],
stream: Callable[[SSEProgressEvent], None] | None = None,
state: BraintrustState | None = None,
):
# Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
if state is None:
from braintrust.logger import _internal_get_global_state

state = _internal_get_global_state()

state.span_cache.start()
try:
return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
finally:
# Clean up disk-based span cache after eval completes and stop caching
state.span_cache.dispose()
state.span_cache.stop()


async def _run_evaluator_internal_impl(
experiment,
evaluator: Evaluator,
position: int | None,
filters: list[Filter],
stream: Callable[[SSEProgressEvent], None] | None = None,
state: BraintrustState | None = None,
):
event_loop = asyncio.get_event_loop()

@@ -1290,11 +1313,13 @@ async def await_or_run_scorer(root_span, scorer, name, **kwargs):
{**parent_propagated},
{"span_attributes": {"purpose": "scorer"}},
)
# Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
with root_span.start_span(
name=name,
span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
propagated_event=merged_propagated,
input=dict(**kwargs),
input=logged_input,
) as span:
score = scorer
if hasattr(scorer, "eval_async"):
@@ -1415,6 +1440,57 @@ def report_progress(event: TaskProgressEvent):
tags = hooks.tags if hooks.tags else None
root_span.log(output=output, metadata=metadata, tags=tags)

# Create trace object for scorers
from braintrust.trace import LocalTrace

async def ensure_spans_flushed():
# Flush native Braintrust spans
if experiment:
from braintrust.logger import flush as flush_logger

await asyncio.get_event_loop().run_in_executor(
None, lambda: flush_logger(state=experiment._state)
)
elif state:
from braintrust.logger import flush as flush_logger

await asyncio.get_event_loop().run_in_executor(None, lambda: flush_logger(state=state))
else:
from braintrust.logger import flush as flush_logger

await asyncio.get_event_loop().run_in_executor(None, flush_logger)

experiment_id = None
if experiment:
try:
experiment_id = experiment.id
except Exception:
experiment_id = None

trace = None
if state or experiment:
# Get the state to use
trace_state = state
if not trace_state and experiment:
trace_state = experiment._state
if not trace_state:
# Fall back to global state
from braintrust.logger import _internal_get_global_state

trace_state = _internal_get_global_state()

# Access root_span_id from the concrete SpanImpl instance
# The Span interface doesn't expose this but SpanImpl has it
root_span_id_value = getattr(root_span, "root_span_id", root_span.id)

trace = LocalTrace(
object_type="experiment",
object_id=experiment_id or "",
root_span_id=root_span_id_value,
ensure_spans_flushed=ensure_spans_flushed,
state=trace_state,
)

score_promises = [
asyncio.create_task(
await_or_run_scorer(
Expand All @@ -1426,6 +1502,7 @@ def report_progress(event: TaskProgressEvent):
"expected": datum.expected,
"metadata": metadata,
"output": output,
"trace": trace,
},
)
)
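To summarize what scorers receive from this change, here is a hedged sketch of the trace surface implied by the LocalTrace(...) construction above and by the example scorer's calls (get_configuration and get_spans). It is a typing Protocol for illustration only, not the actual LocalTrace definition; anything not shown in the diff is an assumption.

```python
# Illustrative only: the scorer-facing trace interface inferred from this PR.
from typing import Any, Protocol


class TraceLike(Protocol):
    def get_configuration(self) -> dict[str, Any]:
        """Return {"objectType", "objectId", "rootSpanId"} for the current eval row."""
        ...

    async def get_spans(self) -> list[Any]:
        """Ensure pending spans are flushed, then fetch the spans under the root span."""
        ...
```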
4 changes: 3 additions & 1 deletion py/src/braintrust/functions/invoke.py
@@ -3,7 +3,7 @@
from sseclient import SSEClient

from .._generated_types import FunctionTypeEnum
from ..logger import Exportable, get_span_parent_object, login, proxy_conn
from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
from ..util import response_raise_for_status
from .constants import INVOKE_API_VERSION
from .stream import BraintrustInvokeError, BraintrustStream
@@ -243,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
:param version: Optional version of the function to use. Defaults to latest.
:return: A function that can be used as a task or scorer.
"""
# Disable span cache since remote function spans won't be in the local cache
_internal_get_global_state().span_cache.disable()

def f(*args: Any, **kwargs: Any) -> Any:
if len(args) > 0:
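A hedged sketch of how the remote-function path interacts with the cache change above: calling init_function disables the local span cache, so a scorer's trace queries fall back to the server, where the remote function's spans actually live. Project and slug names below are placeholders.

```python
from braintrust import Eval
from braintrust.functions.invoke import init_function

llm_judge = init_function("my-project", "llm-judge")  # version defaults to latest

Eval(
    "remote-scorer-example",                           # placeholder project name
    data=lambda: [{"input": "hi", "expected": "hi"}],
    task=lambda input: input,
    scores=[llm_judge],   # remote scorer; the span cache is now disabled
)
```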
61 changes: 61 additions & 0 deletions py/src/braintrust/functions/test_invoke.py
@@ -0,0 +1,61 @@
"""Tests for the invoke module, particularly init_function."""


from braintrust.functions.invoke import init_function
from braintrust.logger import _internal_get_global_state, _internal_reset_global_state


class TestInitFunction:
"""Tests for init_function."""

def setup_method(self):
"""Reset state before each test."""
_internal_reset_global_state()

def teardown_method(self):
"""Clean up after each test."""
_internal_reset_global_state()

def test_init_function_disables_span_cache(self):
"""Test that init_function disables the span cache."""
state = _internal_get_global_state()

# Cache should be disabled by default (it's only enabled during evals)
assert state.span_cache.disabled is True

# Enable the cache (simulating what happens during eval)
state.span_cache.start()
assert state.span_cache.disabled is False

# Call init_function
f = init_function("test-project", "test-function")

# Cache should now be disabled (init_function explicitly disables it)
assert state.span_cache.disabled is True
assert f.__name__ == "init_function-test-project-test-function-latest"

def test_init_function_with_version(self):
"""Test that init_function creates a function with the correct name including version."""
f = init_function("my-project", "my-scorer", version="v1")
assert f.__name__ == "init_function-my-project-my-scorer-v1"

def test_init_function_without_version_uses_latest(self):
"""Test that init_function uses 'latest' in name when version not specified."""
f = init_function("my-project", "my-scorer")
assert f.__name__ == "init_function-my-project-my-scorer-latest"

def test_init_function_permanently_disables_cache(self):
"""Test that init_function permanently disables the cache (can't be re-enabled)."""
state = _internal_get_global_state()

# Enable the cache
state.span_cache.start()
assert state.span_cache.disabled is False

# Call init_function
init_function("test-project", "test-function")
assert state.span_cache.disabled is True

# Try to start again - should still be disabled because of explicit disable
state.span_cache.start()
assert state.span_cache.disabled is True
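For reference, a hedged sketch of the SpanCache lifecycle these tests and the framework/logger changes imply. Only calls that appear in this PR are used; the ordering notes are inferred rather than documented behavior.

```python
from braintrust.logger import _internal_get_global_state

state = _internal_get_global_state()

# Disabled by default; _run_evaluator_internal enables it for the duration of an eval.
state.span_cache.start()
assert state.span_cache.disabled is False

# While enabled, log_internal queues experiment spans into the cache:
#   state.span_cache.queue_write(root_span_id, span_id, cached_span)

# After the eval, the framework drops the on-disk cache and stops caching.
state.span_cache.dispose()
state.span_cache.stop()

# disable() is a permanent opt-out (used by init_function and the OTEL hook);
# start() cannot re-enable the cache afterwards.
state.span_cache.disable()
state.span_cache.start()
assert state.span_cache.disabled is True
```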
54 changes: 54 additions & 0 deletions py/src/braintrust/logger.py
@@ -401,6 +401,11 @@ def default_get_api_conn():
),
)

from braintrust.span_cache import SpanCache

self.span_cache = SpanCache()
self._otel_flush_callback: Any | None = None

def reset_login_info(self):
self.app_url: str | None = None
self.app_public_url: str | None = None
@@ -457,6 +462,21 @@ def context_manager(self):

return self._context_manager

def register_otel_flush(self, callback: Any) -> None:
"""
Register an OTEL flush callback. This is called by the OTEL integration
when it initializes a span processor/exporter.
"""
self._otel_flush_callback = callback

async def flush_otel(self) -> None:
"""
Flush OTEL spans if a callback is registered.
Called during ensure_spans_flushed to ensure OTEL spans are visible in BTQL.
"""
if self._otel_flush_callback:
await self._otel_flush_callback()

def copy_state(self, other: "BraintrustState"):
"""Copy login information from another BraintrustState instance."""
self.__dict__.update({
@@ -1777,6 +1797,25 @@ def login(
_state.login(app_url=app_url, api_key=api_key, org_name=org_name, force_login=force_login)


def register_otel_flush(callback: Any) -> None:
"""
Register a callback to flush OTEL spans. This is called by the OTEL integration
when it initializes a span processor/exporter.

When ensure_spans_flushed is called (e.g., before a BTQL query in scorers),
this callback will be invoked to ensure OTEL spans are flushed to the server.

Also disables the span cache, since OTEL spans aren't in the local cache
and we need BTQL to see the complete span tree (both native + OTEL spans).

:param callback: The async callback function to flush OTEL spans.
"""
global _state
_state.register_otel_flush(callback)
# Disable span cache since OTEL spans aren't in the local cache
_state.span_cache.disable()


def login_to_state(
app_url: str | None = None,
api_key: str | None = None,
@@ -3847,6 +3886,21 @@ def log_internal(self, event: dict[str, Any] | None = None, internal_data: dict[
if serializable_partial_record.get("metrics", {}).get("end") is not None:
self._logged_end_time = serializable_partial_record["metrics"]["end"]

# Write to local span cache for scorer access
# Only cache experiment spans - regular logs don't need caching
if self.parent_object_type == SpanObjectTypeV3.EXPERIMENT:
from braintrust.span_cache import CachedSpan

cached_span = CachedSpan(
span_id=self.span_id,
input=serializable_partial_record.get("input"),
output=serializable_partial_record.get("output"),
metadata=serializable_partial_record.get("metadata"),
span_parents=self.span_parents,
span_attributes=serializable_partial_record.get("span_attributes"),
)
self.state.span_cache.queue_write(self.root_span_id, self.span_id, cached_span)

def compute_record() -> dict[str, Any]:
exporter = _get_exporter()
return dict(
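Finally, a hedged sketch of how an OTEL integration might use the new register_otel_flush hook. The hook itself comes from this diff; the tracer-provider wiring and force_flush call follow the OpenTelemetry SDK and are assumptions here, not part of the PR.

```python
import asyncio

from braintrust.logger import register_otel_flush


def wire_up_otel(tracer_provider) -> None:
    """Register an async flush callback for an OpenTelemetry TracerProvider (assumed setup)."""

    async def flush_otel_spans() -> None:
        # force_flush() blocks in the OTel SDK, so run it off the event loop.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, tracer_provider.force_flush)

    # Registering the callback also disables the local span cache, so scorer
    # BTQL queries go to the server where native and OTEL spans are both visible.
    register_otel_flush(flush_otel_spans)
```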