1 change: 1 addition & 0 deletions ddtrace/llmobs/_constants.py
@@ -56,6 +56,7 @@
EVP_PAYLOAD_SIZE_LIMIT = 5 << 20 # 5MB (actual limit is 5.1MB)
EVP_EVENT_SIZE_LIMIT = (1 << 20) - 1024 # 999KB (actual limit is 1MB)

EXPERIMENT_CSV_FIELD_MAX_SIZE = 10 * 1024 * 1024

DROPPED_IO_COLLECTION_ERROR = "dropped_io"
DROPPED_VALUE_TEXT = "[This value has been dropped because this span's size exceeds the 1MB size limit.]"
47 changes: 46 additions & 1 deletion ddtrace/llmobs/_experiment.py
@@ -171,6 +171,50 @@ def __len__(self) -> int:
def __iter__(self) -> Iterator[DatasetRecord]:
return iter(self._records)

def as_dataframe(self):
try:
import pandas as pd
except ImportError as e:
raise ImportError(
"pandas is required to convert dataset to DataFrame. Please install via `pip install pandas`"
) from e

column_tuples = set()
data_rows = []
for record in self._records:
flat_record = {}

input_data = record.get("input_data", {})
if isinstance(input_data, dict):
for k, v in input_data.items():
flat_record[("input_data", k)] = v
column_tuples.add(("input_data", k))
else:
flat_record[("input_data", "")] = input_data # Use empty string for single input
column_tuples.add(("input_data", ""))

expected_output = record.get("expected_output", {})
if isinstance(expected_output, dict):
for k, v in expected_output.items():
flat_record[("expected_output", k)] = v
column_tuples.add(("expected_output", k))
else:
flat_record[("expected_output", "")] = expected_output # Use empty string for single output
column_tuples.add(("expected_output", ""))

for k, v in record.get("metadata", {}).items():
flat_record[("metadata", k)] = v
column_tuples.add(("metadata", k))

data_rows.append(flat_record)

records_list = []
for flat_record in data_rows:
row = [flat_record.get(col, None) for col in column_tuples]
records_list.append(row)

return pd.DataFrame(data=records_list, columns=pd.MultiIndex.from_tuples(column_tuples))
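For illustration, here is a standalone pandas sketch (example data only, not part of this change) of the column layout as_dataframe() produces: dict-valued input_data or expected_output fields expand into ("section", key) column tuples, while scalar values fall under ("section", "").

import pandas as pd

# Hypothetical record: {"input_data": {"question": "What is 2+2?"},
#                       "expected_output": "4", "metadata": {"difficulty": "easy"}}
columns = pd.MultiIndex.from_tuples(
    [("input_data", "question"), ("expected_output", ""), ("metadata", "difficulty")]
)
df = pd.DataFrame([["What is 2+2?", "4", "easy"]], columns=columns)
print(df["input_data"]["question"])  # select by top-level section, then sub-key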


class Experiment:
def __init__(
@@ -218,7 +262,8 @@ def run(
if not self._llmobs_instance.enabled:
logger.warning(
"Skipping experiment as LLMObs is not enabled. "
"Ensure LLM Observability is enabled via `LLMObs.enable(...)` or set `DD_LLMOBS_ENABLED=1`."
"Ensure LLM Observability is enabled via `LLMObs.enable(...)` "
"or set `DD_LLMOBS_ENABLED=1` and use `ddtrace-run` to run your application."
)
return []
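For reference, a minimal enablement sketch covering the two paths the warning points to (the ml_app value and script name are placeholders):

from ddtrace.llmobs import LLMObs

# In-code enablement before running experiments.
LLMObs.enable(ml_app="my-experiment-app")

# Or via environment variables plus ddtrace-run:
#   DD_LLMOBS_ENABLED=1 DD_LLMOBS_ML_APP=my-experiment-app ddtrace-run python run_experiment.py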

65 changes: 64 additions & 1 deletion ddtrace/llmobs/_llmobs.py
@@ -1,3 +1,4 @@
import csv
from dataclasses import dataclass
from dataclasses import field
import inspect
@@ -49,6 +50,7 @@
from ddtrace.llmobs._constants import DISPATCH_ON_LLM_TOOL_CHOICE
from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL
from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL_OUTPUT_USED
from ddtrace.llmobs._constants import EXPERIMENT_CSV_FIELD_MAX_SIZE
from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
from ddtrace.llmobs._constants import EXPERIMENT_ID_KEY
from ddtrace.llmobs._constants import INPUT_DOCUMENTS
@@ -79,7 +81,7 @@
from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
from ddtrace.llmobs._experiment import Dataset
from ddtrace.llmobs._experiment import DatasetRecordInputType
from ddtrace.llmobs._experiment import DatasetRecordRaw as DatasetRecord
from ddtrace.llmobs._experiment import DatasetRecord
from ddtrace.llmobs._experiment import Experiment
from ddtrace.llmobs._experiment import ExperimentConfigType
from ddtrace.llmobs._experiment import JSONType
@@ -597,6 +599,67 @@ def create_dataset(cls, name: str, description: str, records: List[DatasetRecord
ds.push()
return ds

@classmethod
def create_dataset_from_csv(
cls,
csv_path: str,
dataset_name: str,
input_data_columns: List[str],
expected_output_columns: List[str],
metadata_columns: List[str] = [],
csv_delimiter: str = ",",
description="",
) -> Dataset:
ds = cls._instance._dne_client.dataset_create(dataset_name, description)

# Store the original field size limit to restore it later
original_field_size_limit = csv.field_size_limit()

csv.field_size_limit(EXPERIMENT_CSV_FIELD_MAX_SIZE)  # 10 MB

try:
with open(csv_path, mode="r") as csvfile:
content = csvfile.readline().strip()
if not content:
raise ValueError("CSV file appears to be empty or header is missing.")

csvfile.seek(0)

rows = csv.DictReader(csvfile, delimiter=csv_delimiter)

if rows.fieldnames is None:
raise ValueError("CSV file appears to be empty or header is missing.")

header_columns = rows.fieldnames
missing_input_columns = [col for col in input_data_columns if col not in header_columns]
missing_output_columns = [col for col in expected_output_columns if col not in header_columns]
missing_metadata_columns = [col for col in metadata_columns if col not in header_columns]

if any(col not in header_columns for col in input_data_columns):
raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}")
if any(col not in header_columns for col in expected_output_columns):
raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}")
if any(col not in header_columns for col in metadata_columns):
raise ValueError(f"Metadata columns not found in CSV header: {missing_metadata_columns}")

for row in rows:
ds.append(
DatasetRecord(
input_data={col: row[col] for col in input_data_columns},
expected_output={col: row[col] for col in expected_output_columns},
metadata={col: row[col] for col in metadata_columns},
record_id="",
)
)

finally:
# Always restore the original field size limit
csv.field_size_limit(original_field_size_limit)

if len(ds) > 0:
ds.push()
return ds
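A hypothetical call sketch for the new method (the CSV path, dataset name, and column names are illustrative; it assumes a file with the header question,context,answer,difficulty and that LLMObs has already been enabled):

from ddtrace.llmobs import LLMObs

ds = LLMObs.create_dataset_from_csv(
    csv_path="qa_pairs.csv",
    dataset_name="qa-from-csv",
    input_data_columns=["question", "context"],
    expected_output_columns=["answer"],
    metadata_columns=["difficulty"],
)
print(len(ds))  # number of records parsed and pushed from the CSV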

@classmethod
def _delete_dataset(cls, dataset_id: str) -> None:
return cls._instance._dne_client.dataset_delete(dataset_id)
1 change: 1 addition & 0 deletions riotfile.py
@@ -3134,6 +3134,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT
"pytest-asyncio": "==0.21.1",
"ragas": "==0.1.21",
"langchain": latest,
"pandas": latest,
},
pys=select_pys(min_version="3.8"),
),
@@ -0,0 +1,49 @@
interactions:
- request:
body: '{"data": {"type": "datasets", "id": "8dbba503-cf48-4e82-805e-1d9cb46e5ed3",
"attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2":
"r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {}},
{"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output":
{"out0": "r1v4", "out1": "r1v5"}, "metadata": {}}], "update_records": [], "delete_records":
[]}}}'
headers:
Accept:
- '*/*'
? !!python/object/apply:multidict._multidict.istr
- Accept-Encoding
: - identity
Connection:
- keep-alive
Content-Length:
- '410'
? !!python/object/apply:multidict._multidict.istr
- Content-Type
: - application/json
User-Agent:
- python-requests/2.32.3
method: POST
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/batch_update
response:
body:
string: '{"data":[{"id":"79dc4c72-e4fd-4de7-b46e-419b8072a207","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449869Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449869Z","version":1}},{"id":"f0f3fe26-1060-452f-a292-30b1293dd5a1","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449869Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449869Z","version":1}}]}'
headers:
content-length:
- '812'
content-security-policy:
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
content-type:
- application/vnd.api+json
date:
- Wed, 23 Jul 2025 02:50:48 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
- Accept-Encoding
x-content-type-options:
- nosniff
x-frame-options:
- SAMEORIGIN
status:
code: 200
message: OK
version: 1
@@ -0,0 +1,45 @@
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
? !!python/object/apply:multidict._multidict.istr
- Accept-Encoding
: - identity
Connection:
- keep-alive
? !!python/object/apply:multidict._multidict.istr
- Content-Length
: - '0'
? !!python/object/apply:multidict._multidict.istr
- Content-Type
: - application/json
User-Agent:
- python-requests/2.32.3
method: GET
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/records
response:
body:
string: '{"data":[{"id":"79dc4c72-e4fd-4de7-b46e-419b8072a207","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449Z"}},{"id":"f0f3fe26-1060-452f-a292-30b1293dd5a1","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449Z"}}],"meta":{"after":""}}'
headers:
content-length:
- '796'
content-security-policy:
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
content-type:
- application/vnd.api+json
date:
- Wed, 23 Jul 2025 02:50:52 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
- Accept-Encoding
x-content-type-options:
- nosniff
x-frame-options:
- SAMEORIGIN
status:
code: 200
message: OK
version: 1
@@ -0,0 +1,49 @@
interactions:
- request:
body: '{"data": {"type": "datasets", "id": "acf19ca4-8062-4548-abbf-95b33f55d51d",
"attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2":
"r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {"m0":
"r0v6"}}, {"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output":
{"out0": "r1v4", "out1": "r1v5"}, "metadata": {"m0": "r1v6"}}], "update_records":
[], "delete_records": []}}}'
headers:
Accept:
- '*/*'
? !!python/object/apply:multidict._multidict.istr
- Accept-Encoding
: - identity
Connection:
- keep-alive
Content-Length:
- '434'
? !!python/object/apply:multidict._multidict.istr
- Content-Type
: - application/json
User-Agent:
- python-requests/2.32.3
method: POST
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/acf19ca4-8062-4548-abbf-95b33f55d51d/batch_update
response:
body:
string: '{"data":[{"id":"016350fb-8b44-4d5d-af57-0c8408bb5d25","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124332Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{"m0":"r0v6"},"updated_at":"2025-07-23T02:50:52.795124332Z","version":1}},{"id":"91fcfd23-040c-40ad-a0b0-f497f10b41e3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124332Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{"m0":"r1v6"},"updated_at":"2025-07-23T02:50:52.795124332Z","version":1}}]}'
headers:
content-length:
- '834'
content-security-policy:
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
content-type:
- application/vnd.api+json
date:
- Wed, 23 Jul 2025 02:50:52 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
- Accept-Encoding
x-content-type-options:
- nosniff
x-frame-options:
- SAMEORIGIN
status:
code: 200
message: OK
version: 1
@@ -0,0 +1,45 @@
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
? !!python/object/apply:multidict._multidict.istr
- Accept-Encoding
: - identity
Connection:
- keep-alive
? !!python/object/apply:multidict._multidict.istr
- Content-Length
: - '0'
? !!python/object/apply:multidict._multidict.istr
- Content-Type
: - application/json
User-Agent:
- python-requests/2.32.3
method: GET
uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/acf19ca4-8062-4548-abbf-95b33f55d51d/records
response:
body:
string: '{"data":[{"id":"016350fb-8b44-4d5d-af57-0c8408bb5d25","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{"m0":"r0v6"},"updated_at":"2025-07-23T02:50:52.795124Z"}},{"id":"91fcfd23-040c-40ad-a0b0-f497f10b41e3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{"m0":"r1v6"},"updated_at":"2025-07-23T02:50:52.795124Z"}}],"meta":{"after":""}}'
headers:
content-length:
- '818'
content-security-policy:
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
content-type:
- application/vnd.api+json
date:
- Wed, 23 Jul 2025 02:50:55 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
- Accept-Encoding
x-content-type-options:
- nosniff
x-frame-options:
- SAMEORIGIN
status:
code: 200
message: OK
version: 1