Skip to content

Commit c28da61

Browse files
authored
chore(llmobs): submit experiment spans to experiment scope (#14056)
[MLOB-3298] This PR adds support for: 1. Annotating experiment spans with `expected_output` 2. Submitting experiment spans/traces to the LLMObs experiment scope. Notes: - I/O for experiment spans is currently annotated the same way as for regular LLMObs spans, i.e. `meta.input.value` and `meta.output.value`. It looks like experiments use just `meta.input` and `meta.output`. This shouldn't be blocking for now but should be followed up on. - We are currently using context baggage to determine when a span is considered an experiment span. We might want to follow the classic span store/request header pattern in the future. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance 
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) [MLOB-3298]: https://datadoghq.atlassian.net/browse/MLOB-3298?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
1 parent 9133851 commit c28da61

File tree

5 files changed

+50
-9
lines changed

5 files changed

+50
-9
lines changed

ddtrace/llmobs/_constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,4 @@
9696
PROXY_REQUEST = "llmobs.proxy_request"
9797

9898
EXPERIMENT_ID_KEY = "_ml_obs.experiment_id"
99+
EXPERIMENT_EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output"

ddtrace/llmobs/_experiment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from ddtrace.constants import ERROR_STACK
2121
from ddtrace.constants import ERROR_TYPE
2222
from ddtrace.internal.logger import get_logger
23+
from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
2324

2425

2526
if TYPE_CHECKING:
@@ -262,6 +263,7 @@ def _process_record(self, idx_record: Tuple[int, DatasetRecord]) -> Optional[Tas
262263
except Exception:
263264
span.set_exc_info(*sys.exc_info())
264265
self._llmobs_instance.annotate(span, input_data=input_data, output_data=output_data, tags=tags)
266+
span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, record["expected_output"])
265267
return {
266268
"idx": idx,
267269
"span_id": span_id,

ddtrace/llmobs/_llmobs.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
from ddtrace.llmobs._constants import DISPATCH_ON_LLM_TOOL_CHOICE
4949
from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL
5050
from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL_OUTPUT_USED
51+
from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
5152
from ddtrace.llmobs._constants import EXPERIMENT_ID_KEY
5253
from ddtrace.llmobs._constants import INPUT_DOCUMENTS
5354
from ddtrace.llmobs._constants import INPUT_MESSAGES
@@ -241,6 +242,11 @@ def _llmobs_span_event(self, span: Span) -> LLMObsSpanEvent:
241242
raise KeyError("Span kind not found in span context")
242243

243244
llmobs_span = LLMObsSpan()
245+
_dd_attrs = {
246+
"span_id": str(span.span_id),
247+
"trace_id": format_trace_id(span.trace_id),
248+
"apm_trace_id": format_trace_id(span.trace_id),
249+
}
244250

245251
meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}}
246252
if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None:
@@ -256,6 +262,12 @@ def _llmobs_span_event(self, span: Span) -> LLMObsSpanEvent:
256262
{"content": safe_json(span._get_ctx_item(INPUT_VALUE), ensure_ascii=False), "role": ""}
257263
]
258264

265+
if span.context.get_baggage_item(EXPERIMENT_ID_KEY):
266+
_dd_attrs["scope"] = "experiments"
267+
expected_output = span._get_ctx_item(EXPERIMENT_EXPECTED_OUTPUT)
268+
if span_kind == "experiment" and expected_output:
269+
meta["expected_output"] = expected_output
270+
259271
input_messages = span._get_ctx_item(INPUT_MESSAGES)
260272
if span_kind == "llm" and input_messages is not None:
261273
input_type = "messages"
@@ -349,11 +361,7 @@ def _llmobs_span_event(self, span: Span) -> LLMObsSpanEvent:
349361
"meta": meta,
350362
"metrics": metrics,
351363
"tags": [],
352-
"_dd": {
353-
"span_id": str(span.span_id),
354-
"trace_id": format_trace_id(span.trace_id),
355-
"apm_trace_id": format_trace_id(span.trace_id),
356-
},
364+
"_dd": _dd_attrs,
357365
}
358366
session_id = _get_session_id(span)
359367
if session_id is not None:

ddtrace/llmobs/_writer.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -544,10 +544,18 @@ def enqueue(self, event: LLMObsSpanEvent) -> None:
544544
self._enqueue(event, truncated_event_size or raw_event_size)
545545

546546
def _data(self, events: List[LLMObsSpanEvent]) -> List[Dict[str, Any]]:
547-
return [
548-
{"_dd.stage": "raw", "_dd.tracer_version": ddtrace.__version__, "event_type": "span", "spans": [event]}
549-
for event in events
550-
]
547+
payload = []
548+
for event in events:
549+
event_data = {
550+
"_dd.stage": "raw",
551+
"_dd.tracer_version": ddtrace.__version__,
552+
"event_type": "span",
553+
"spans": [event],
554+
}
555+
if event.get("_dd", {}).get("scope") == "experiments":
556+
event_data["_dd.scope"] = "experiments"
557+
payload.append(event_data)
558+
return payload
551559

552560

553561
def _truncate_span_event(event: LLMObsSpanEvent) -> LLMObsSpanEvent:

tests/llmobs/test_experiments.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,3 +507,25 @@ def test_experiment_run(llmobs, test_dataset_one_record):
507507
assert exp_result["input"] == {"prompt": "What is the capital of France?"}
508508
assert exp_result["output"] == {"prompt": "What is the capital of France?"}
509509
assert exp_result["expected_output"] == {"answer": "Paris"}
510+
511+
512+
def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test_dataset_one_record):
513+
"""Assert that the experiment span includes expected output field and includes the experiment scope."""
514+
exp = llmobs.experiment(
515+
"test_experiment", dummy_task, test_dataset_one_record, [dummy_evaluator], project_name="test-project"
516+
)
517+
exp._id = "1234567890"
518+
exp._run_task(1, raise_errors=False)
519+
assert len(llmobs_events) == 1
520+
event = llmobs_events[0]
521+
assert event["name"] == "dummy_task"
522+
for key in ("span_id", "trace_id", "parent_id", "start_ns", "duration", "metrics"):
523+
assert event[key] == mock.ANY
524+
assert event["status"] == "ok"
525+
assert event["meta"]["input"] == {"value": '{"prompt": "What is the capital of France?"}'}
526+
assert event["meta"]["output"] == {"value": '{"prompt": "What is the capital of France?"}'}
527+
assert event["meta"]["expected_output"] == {"answer": "Paris"}
528+
assert "dataset_id:{}".format(test_dataset_one_record._id) in event["tags"]
529+
assert "dataset_record_id:{}".format(test_dataset_one_record._records[0]["record_id"]) in event["tags"]
530+
assert "experiment_id:1234567890" in event["tags"]
531+
assert event["_dd"]["scope"] == "experiments"

0 commit comments

Comments
 (0)