Skip to content

fix: multi-root traces silently drop root spans from evaluation and export#2599

Open
aerosta wants to merge 1 commit into confident-ai:main from
aerosta:fix/multi-root-trace-traversal
Open

fix: multi-root traces silently drop root spans from evaluation and export#2599
aerosta wants to merge 1 commit into confident-ai:main from
aerosta:fix/multi-root-trace-traversal

Conversation

@aerosta
Copy link
Copy Markdown
Contributor

@aerosta aerosta commented Apr 4, 2026

Summary

Multi-root traces were handled inconsistently across the evaluation pipeline. Metric counting already treated root_spans as a list, but several traversal and trace serialization paths still only used the first root span. As a result, additional root subtrees could be omitted from grading and from the trace data passed to trace-dependent metrics.

This change updates traversal, serialization, and trace-level error propagation to operate on every root span so multi-root traces are evaluated and exported consistently.

Changes

  • preserve all real root spans when unwrapping the evaluation dummy root
  • add a trace-level serialization helper that includes every root span while keeping the existing single-root shape unchanged
  • update async and sync trace evaluation paths to traverse every root span
  • update trace-level error handling to mark all root spans, not just the first
  • add regression coverage for multi-root serialization, traversal, and dummy-root unwrapping

Test plan

DEEPEVAL_TELEMETRY_OPT_OUT=1 python -m pytest tests/test_core/test_evaluation/test_multi_root_trace_handling.py -q --tb=short
DEEPEVAL_TELEMETRY_OPT_OUT=1 python -m pytest tests/test_core/test_evaluation/test_trace_results_extraction.py -q --tb=short
DEEPEVAL_TELEMETRY_OPT_OUT=1 python -m pytest tests/test_core/test_evaluation/test_execute/test_error_boundary.py -q --tb=short -k multiple_roots

@vercel
Copy link
Copy Markdown

vercel bot commented Apr 4, 2026

@aerosta is attempting to deploy a commit to the Confident AI Team on Vercel.

A member of the Team first needs to authorize it.

@penguine-ip
Copy link
Copy Markdown
Contributor

Hey @aerosta, thank you for this! Do you mind sharing how I could reproduce the issues so I can merge this more safely? Thank you!

@aerosta
Copy link
Copy Markdown
Contributor Author

aerosta commented Apr 10, 2026

Hey @aerosta, thank you for this! Do you mind sharing how I could reproduce the issues so I can merge this more safely? Thank you!

Hey @penguine-ip thanks for taking a look. I reproduced this on the current main.

import asyncio
import time
from importlib import import_module

from deepeval.dataset import Golden
from deepeval.metrics import BaseMetric
from deepeval.test_run import TestRunManager
from deepeval.tracing.types import LlmSpan, Trace, TraceSpanStatus

exec_mod = import_module("deepeval.evaluate.execute")


class RecordingAsyncMetric(BaseMetric):
    """Async test-double metric.

    Always scores 1.0 and, when ``requires_trace`` is set, records the
    serialized trace dict (``test_case._trace_dict``) it was measured
    with so the repro script can inspect what the pipeline passed in.
    """

    def __init__(self, name: str, *, requires_trace: bool = False):
        self.name = name
        self.requires_trace = requires_trace
        self.threshold = 0.5
        self.strict_mode = False
        self.skipped = False
        # Attributes the evaluation pipeline reads; all unset until measured.
        for attr in (
            "score",
            "reason",
            "success",
            "error",
            "evaluation_model",
            "evaluation_cost",
            "verbose_logs",
            "captured_trace",
        ):
            setattr(self, attr, None)

    @property
    def __name__(self):
        # The pipeline reports metrics by ``__name__``; expose the given name.
        return self.name

    async def a_measure(self, test_case, *args, **kwargs):
        if self.requires_trace:
            # Snapshot the trace dict handed to trace-dependent metrics.
            self.captured_trace = test_case._trace_dict
        self.score, self.reason, self.success = 1.0, "ok", True
        return self.score

    def measure(self, test_case, *args, **kwargs):
        # The repro exercises only the async path.
        raise NotImplementedError

    def is_successful(self):
        return bool(self.success)


def make_llm_span(trace_uuid, name, *, parent_uuid=None, children=None, metrics=None):
    """Build a minimal successful LlmSpan with uuid ``'<trace_uuid>-<name>'``.

    Start and end times are identical (zero duration); input/output are
    derived from ``name`` so spans are easy to tell apart in output.
    """
    timestamp = time.perf_counter()
    return LlmSpan(
        uuid=f"{trace_uuid}-{name}",
        trace_uuid=trace_uuid,
        parent_uuid=parent_uuid,
        start_time=timestamp,
        end_time=timestamp,
        status=TraceSpanStatus.SUCCESS,
        children=[] if children is None else children,
        name=name,
        input=f"{name}-input",
        output=f"{name}-output",
        metrics=[] if metrics is None else metrics,
    )


def make_multi_root_trace():
    """Build a trace with two root spans (root-1/child-1, root-2/child-2).

    Every span carries its own RecordingAsyncMetric; the trace itself
    carries a trace-dependent metric. Returns ``(trace, trace_metric)``.
    """
    trace_uuid = "trace-multi-root"

    root_spans = []
    for idx in (1, 2):
        child = make_llm_span(
            trace_uuid,
            f"child-{idx}",
            parent_uuid=f"{trace_uuid}-root-{idx}",
            metrics=[RecordingAsyncMetric(f"child-{idx}-metric")],
        )
        root_spans.append(
            make_llm_span(
                trace_uuid,
                f"root-{idx}",
                children=[child],
                metrics=[RecordingAsyncMetric(f"root-{idx}-metric")],
            )
        )

    timestamp = time.perf_counter()
    trace_metric = RecordingAsyncMetric("trace-metric", requires_trace=True)
    trace = Trace(
        uuid=trace_uuid,
        status=TraceSpanStatus.SUCCESS,
        root_spans=root_spans,
        start_time=timestamp,
        end_time=timestamp,
        input="trace-input",
        output="trace-output",
        metrics=[trace_metric],
    )
    return trace, trace_metric


async def main():
    """Run the async agentic evaluation path on a multi-root trace and
    print which root subtrees were dropped by serialization and traversal."""
    trace, trace_metric = make_multi_root_trace()

    captured = {}
    test_run_manager = TestRunManager()
    # Capture the serialized trace the first time the test run is updated.
    test_run_manager.update_test_run = (
        lambda api_test_case, _test_case: captured.setdefault(
            "trace_api", api_test_case.trace
        )
    )
    # Stub out result extraction; only serialization/traversal matter here.
    exec_mod.extract_trace_test_results = lambda _api: []

    await exec_mod._a_execute_agentic_test_case(
        golden=Golden(input="golden-input"),
        test_run_manager=test_run_manager,
        test_results=[],
        count=1,
        verbose_mode=False,
        ignore_errors=False,
        skip_on_missing_params=False,
        show_indicator=False,
        _use_bar_indicator=False,
        _is_assert_test=False,
        observed_callback=None,
        trace=trace,
        trace_metrics=None,
        progress=None,
        pbar_id=None,
    )

    trace_api = captured["trace_api"]
    # Count how many metric results DFS traversal recorded per span.
    metric_counts = {
        span.name: len(span.metrics_data or []) for span in trace_api.llm_spans
    }
    serialized = trace_metric.captured_trace
    serialized_names = [serialized.get("name")]
    serialized_names += [entry.get("name") for entry in serialized.get("children", [])]

    expected_missing = ("root-2", "child-2")
    print("Current trace roots:", [span.name for span in trace.root_spans])
    print("Serialized names seen by metric:", serialized_names)
    print(
        "Serialized roots dropped:",
        [name for name in expected_missing if name not in serialized_names],
    )
    print("Per-span metrics recorded by DFS traversal:", metric_counts)
    print(
        "DFS dropped spans:",
        [name for name in expected_missing if metric_counts.get(name, 0) == 0],
    )


# Guard the entry point so importing this repro module does not run it.
if __name__ == "__main__":
    asyncio.run(main())

Output:

Current trace roots: ['root-1', 'root-2']
Serialized names seen by metric: ['root-1', 'child-1']
Serialized roots dropped: ['root-2', 'child-2']
Per-span metrics recorded by DFS traversal: {'root-1': 1, 'child-1': 1}
DFS dropped spans: ['root-2', 'child-2']

Please let me know if any additional details would be helpful.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants