diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 5e511d4694b..6e96340be21 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -96,3 +96,4 @@ PROXY_REQUEST = "llmobs.proxy_request" EXPERIMENT_ID_KEY = "_ml_obs.experiment_id" +EXPERIMENT_EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output" diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 15f4e069be7..055dc319d8f 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -20,6 +20,7 @@ from ddtrace.constants import ERROR_STACK from ddtrace.constants import ERROR_TYPE from ddtrace.internal.logger import get_logger +from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT if TYPE_CHECKING: @@ -262,6 +263,7 @@ def _process_record(self, idx_record: Tuple[int, DatasetRecord]) -> Optional[Tas except Exception: span.set_exc_info(*sys.exc_info()) self._llmobs_instance.annotate(span, input_data=input_data, output_data=output_data, tags=tags) + span._set_ctx_item(EXPERIMENT_EXPECTED_OUTPUT, record["expected_output"]) return { "idx": idx, "span_id": span_id, diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 9962db01c2c..66df3ffad45 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -48,6 +48,7 @@ from ddtrace.llmobs._constants import DISPATCH_ON_LLM_TOOL_CHOICE from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL_OUTPUT_USED +from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT from ddtrace.llmobs._constants import EXPERIMENT_ID_KEY from ddtrace.llmobs._constants import INPUT_DOCUMENTS from ddtrace.llmobs._constants import INPUT_MESSAGES @@ -241,6 +242,11 @@ def _llmobs_span_event(self, span: Span) -> LLMObsSpanEvent: raise KeyError("Span kind not found in span context") llmobs_span = LLMObsSpan() + _dd_attrs = { + "span_id": str(span.span_id), + "trace_id": format_trace_id(span.trace_id), + "apm_trace_id": format_trace_id(span.trace_id), + } meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: @@ -256,6 +262,12 @@ def _llmobs_span_event(self, span: Span) -> LLMObsSpanEvent: {"content": safe_json(span._get_ctx_item(INPUT_VALUE), ensure_ascii=False), "role": ""} ] + if span.context.get_baggage_item(EXPERIMENT_ID_KEY): + _dd_attrs["scope"] = "experiments" + expected_output = span._get_ctx_item(EXPERIMENT_EXPECTED_OUTPUT) + if span_kind == "experiment" and expected_output: + meta["expected_output"] = expected_output + input_messages = span._get_ctx_item(INPUT_MESSAGES) if span_kind == "llm" and input_messages is not None: input_type = "messages" @@ -349,11 +361,7 @@ def _llmobs_span_event(self, span: Span) -> LLMObsSpanEvent: "meta": meta, "metrics": metrics, "tags": [], - "_dd": { - "span_id": str(span.span_id), - "trace_id": format_trace_id(span.trace_id), - "apm_trace_id": format_trace_id(span.trace_id), - }, + "_dd": _dd_attrs, } session_id = _get_session_id(span) if session_id is not None: diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index aa6fea4ce7e..af27a6efdba 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -544,10 +544,18 @@ def enqueue(self, event: LLMObsSpanEvent) -> None: self._enqueue(event, truncated_event_size or raw_event_size) def _data(self, events: List[LLMObsSpanEvent]) -> List[Dict[str, Any]]: - return [ - {"_dd.stage": "raw", "_dd.tracer_version": ddtrace.__version__, "event_type": "span", "spans": [event]} - for event in events - ] + payload = [] + for event in events: + event_data = { + "_dd.stage": "raw", + "_dd.tracer_version": ddtrace.__version__, + "event_type": "span", + "spans": [event], + } + if event.get("_dd", {}).get("scope") == "experiments": + event_data["_dd.scope"] = "experiments" + payload.append(event_data) + return payload def _truncate_span_event(event: LLMObsSpanEvent) -> LLMObsSpanEvent: diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 7b99f69e4fa..f6ff6542f4d 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -507,3 +507,25 @@ def test_experiment_run(llmobs, test_dataset_one_record): assert exp_result["input"] == {"prompt": "What is the capital of France?"} assert exp_result["output"] == {"prompt": "What is the capital of France?"} assert exp_result["expected_output"] == {"answer": "Paris"} + + +def test_experiment_span_written_to_experiment_scope(llmobs, llmobs_events, test_dataset_one_record): + """Assert that the experiment span includes expected output field and includes the experiment scope.""" + exp = llmobs.experiment( + "test_experiment", dummy_task, test_dataset_one_record, [dummy_evaluator], project_name="test-project" + ) + exp._id = "1234567890" + exp._run_task(1, raise_errors=False) + assert len(llmobs_events) == 1 + event = llmobs_events[0] + assert event["name"] == "dummy_task" + for key in ("span_id", "trace_id", "parent_id", "start_ns", "duration", "metrics"): + assert event[key] == mock.ANY + assert event["status"] == "ok" + assert event["meta"]["input"] == {"value": '{"prompt": "What is the capital of France?"}'} + assert event["meta"]["output"] == {"value": '{"prompt": "What is the capital of France?"}'} + assert event["meta"]["expected_output"] == {"answer": "Paris"} + assert "dataset_id:{}".format(test_dataset_one_record._id) in event["tags"] + assert "dataset_record_id:{}".format(test_dataset_one_record._records[0]["record_id"]) in event["tags"] + assert "experiment_id:1234567890" in event["tags"] + assert event["_dd"]["scope"] == "experiments"