Skip to content

chore(llmobs): submit experiment spans to experiment scope #14056

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ddtrace/llmobs/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,4 @@
PROXY_REQUEST = "llmobs.proxy_request"

# Baggage key checked on the span context to detect that a span belongs to an experiment run.
EXPERIMENT_ID_KEY = "_ml_obs.experiment_id"
# Span context-item key under which an experiment stores the dataset record's expected output.
EXPERIMENT_EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output"
2 changes: 2 additions & 0 deletions ddtrace/llmobs/_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from ddtrace.constants import ERROR_STACK
from ddtrace.constants import ERROR_TYPE
from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT


if TYPE_CHECKING:
Expand Down Expand Up @@ -262,6 +263,7 @@ def _process_record(self, idx_record: Tuple[int, DatasetRecord]) -> Optional[Tas
except Exception:
span.set_exc_info(*sys.exc_info())
self._llmobs_instance.annotate(span, input_data=input_data, output_data=output_data, tags=tags)
span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, record["expected_output"])
return {
"idx": idx,
"span_id": span_id,
Expand Down
18 changes: 13 additions & 5 deletions ddtrace/llmobs/_llmobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
from ddtrace.llmobs._constants import DISPATCH_ON_LLM_TOOL_CHOICE
from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL
from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL_OUTPUT_USED
from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
from ddtrace.llmobs._constants import EXPERIMENT_ID_KEY
from ddtrace.llmobs._constants import INPUT_DOCUMENTS
from ddtrace.llmobs._constants import INPUT_MESSAGES
Expand Down Expand Up @@ -241,6 +242,11 @@ def _llmobs_span_event(self, span: Span) -> LLMObsSpanEvent:
raise KeyError("Span kind not found in span context")

llmobs_span = LLMObsSpan()
_dd_attrs = {
"span_id": str(span.span_id),
"trace_id": format_trace_id(span.trace_id),
"apm_trace_id": format_trace_id(span.trace_id),
}

meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}}
if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None:
Expand All @@ -256,6 +262,12 @@ def _llmobs_span_event(self, span: Span) -> LLMObsSpanEvent:
{"content": safe_json(span._get_ctx_item(INPUT_VALUE), ensure_ascii=False), "role": ""}
]

if span.context.get_baggage_item(EXPERIMENT_ID_KEY):
_dd_attrs["scope"] = "experiments"
expected_output = span._get_ctx_item(EXPERIMENT_EXPECTED_OUTPUT)
if span_kind == "experiment" and expected_output:
meta["expected_output"] = expected_output

input_messages = span._get_ctx_item(INPUT_MESSAGES)
if span_kind == "llm" and input_messages is not None:
input_type = "messages"
Expand Down Expand Up @@ -349,11 +361,7 @@ def _llmobs_span_event(self, span: Span) -> LLMObsSpanEvent:
"meta": meta,
"metrics": metrics,
"tags": [],
"_dd": {
"span_id": str(span.span_id),
"trace_id": format_trace_id(span.trace_id),
"apm_trace_id": format_trace_id(span.trace_id),
},
"_dd": _dd_attrs,
}
session_id = _get_session_id(span)
if session_id is not None:
Expand Down
16 changes: 12 additions & 4 deletions ddtrace/llmobs/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,10 +544,18 @@ def enqueue(self, event: LLMObsSpanEvent) -> None:
self._enqueue(event, truncated_event_size or raw_event_size)

def _data(self, events: List[LLMObsSpanEvent]) -> List[Dict[str, Any]]:
    """Wrap each span event in its own payload envelope.

    Every event becomes one dict carrying the raw-stage marker, the tracer
    version, the ``"span"`` event type and a single-element ``spans`` list.
    When the event's ``_dd`` attributes mark it with the ``"experiments"``
    scope, that scope is mirrored onto the envelope as ``_dd.scope``.

    Args:
        events: Span events to wrap, in submission order.

    Returns:
        One envelope dict per input event, in the same order.
    """
    # NOTE: the previous unconditional ``return [...]`` comprehension is gone;
    # it exited before the scope check below could ever run.
    payload = []
    for event in events:
        event_data: Dict[str, Any] = {
            "_dd.stage": "raw",
            "_dd.tracer_version": ddtrace.__version__,
            "event_type": "span",
            "spans": [event],
        }
        # Only experiment-scoped events get the envelope-level scope key;
        # all other envelopes keep the original four-key shape.
        if event.get("_dd", {}).get("scope") == "experiments":
            event_data["_dd.scope"] = "experiments"
        payload.append(event_data)
    return payload


def _truncate_span_event(event: LLMObsSpanEvent) -> LLMObsSpanEvent:
Expand Down
22 changes: 22 additions & 0 deletions tests/llmobs/test_experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,3 +507,25 @@ def test_experiment_run(llmobs, test_dataset_one_record):
assert exp_result["input"] == {"prompt": "What is the capital of France?"}
assert exp_result["output"] == {"prompt": "What is the capital of France?"}
assert exp_result["expected_output"] == {"answer": "Paris"}


def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test_dataset_one_record):
    """Running one experiment task emits a single span event that carries the
    record's expected output in its meta and is marked with the experiments scope."""
    experiment = llmobs.experiment(
        "test_experiment",
        dummy_task,
        test_dataset_one_record,
        [dummy_evaluator],
        project_name="test-project",
    )
    # Pin the experiment ID so the tag assertion below is deterministic.
    experiment._id = "1234567890"
    experiment._run_task(1, raise_errors=False)

    assert len(llmobs_events) == 1
    span_event = llmobs_events[0]
    assert span_event["name"] == "dummy_task"

    # These fields vary per run; the lookup only verifies that each key is present.
    variable_fields = ("span_id", "trace_id", "parent_id", "start_ns", "duration", "metrics")
    for field in variable_fields:
        assert span_event[field] == mock.ANY

    assert span_event["status"] == "ok"

    span_meta = span_event["meta"]
    assert span_meta["input"] == {"value": '{"prompt": "What is the capital of France?"}'}
    assert span_meta["output"] == {"value": '{"prompt": "What is the capital of France?"}'}
    assert span_meta["expected_output"] == {"answer": "Paris"}

    span_tags = span_event["tags"]
    assert "dataset_id:{}".format(test_dataset_one_record._id) in span_tags
    assert "dataset_record_id:{}".format(test_dataset_one_record._records[0]["record_id"]) in span_tags
    assert "experiment_id:1234567890" in span_tags

    assert span_event["_dd"]["scope"] == "experiments"
Loading