
Commit 0dc0d2c

fix(experiments): pass full evaluation to score creation (#1391)
1 parent 06912ce commit 0dc0d2c

File tree

2 files changed: +133 −1 lines changed

langfuse/_client/client.py

Lines changed: 4 additions & 1 deletion
@@ -2730,6 +2730,7 @@ async def process_item(item: ExperimentItem) -> ExperimentItemResult:
                     comment=evaluation.comment,
                     metadata=evaluation.metadata,
                     data_type=evaluation.data_type,  # type: ignore
+                    config_id=evaluation.config_id,
                 )

             except Exception as e:
@@ -2856,9 +2857,11 @@ async def _process_experiment_item(
                 self.create_score(
                     trace_id=trace_id,
                     name=evaluation.name,
-                    value=evaluation.value or -1,
+                    value=evaluation.value,  # type: ignore
                     comment=evaluation.comment,
                     metadata=evaluation.metadata,
+                    config_id=evaluation.config_id,
+                    data_type=evaluation.data_type,  # type: ignore
                 )

             except Exception as e:
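
For context on the second hunk: dropping the `or -1` fallback matters because Python's `or` replaces every falsy value, not just None. A minimal standalone sketch of the coercion this commit removes (plain Python, not Langfuse code):

    # `value or -1` substitutes -1 for ANY falsy score, not only missing ones.
    for value in (False, 0, 0.0, True, 1):
        print(repr(value), "->", repr(value or -1))
    # False -> -1
    # 0 -> -1
    # 0.0 -> -1
    # True -> True
    # 1 -> 1

So a BOOLEAN evaluation of False (or a legitimate numeric score of 0) was previously ingested as -1. Forwarding `evaluation.value` unchanged, together with the evaluation's `data_type` and `config_id`, passes the full evaluation through to the created score, as the commit title says.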

tests/test_experiments.py

Lines changed: 129 additions & 0 deletions
@@ -668,3 +668,132 @@ def test_format_experiment_results_basic():
 
     langfuse_client.flush()
     time.sleep(1)
+
+
+def test_boolean_score_types():
+    """Test that BOOLEAN score types are properly ingested and persisted."""
+    from langfuse.api import ScoreDataType
+
+    langfuse_client = get_client()
+
+    def boolean_evaluator(*, input, output, expected_output=None, **kwargs):
+        """Boolean evaluator that checks if output contains the expected answer."""
+        if not expected_output:
+            return Evaluation(
+                name="has_expected_content",
+                value=False,
+                data_type=ScoreDataType.BOOLEAN,
+                comment="No expected output to check",
+            )
+
+        contains_expected = expected_output.lower() in str(output).lower()
+        return Evaluation(
+            name="has_expected_content",
+            value=contains_expected,
+            data_type=ScoreDataType.BOOLEAN,
+            comment=f"Output {'contains' if contains_expected else 'does not contain'} expected content",
+        )
+
+    def boolean_run_evaluator(*, item_results: List[ExperimentItemResult], **kwargs):
+        """Run evaluator that returns boolean based on all items passing."""
+        if not item_results:
+            return Evaluation(
+                name="all_items_pass",
+                value=False,
+                data_type=ScoreDataType.BOOLEAN,
+                comment="No items to evaluate",
+            )
+
+        # Check if all boolean evaluations are True
+        all_pass = True
+        for item_result in item_results:
+            for evaluation in item_result.evaluations:
+                if (
+                    evaluation.name == "has_expected_content"
+                    and evaluation.value is False
+                ):
+                    all_pass = False
+                    break
+            if not all_pass:
+                break
+
+        return Evaluation(
+            name="all_items_pass",
+            value=all_pass,
+            data_type=ScoreDataType.BOOLEAN,
+            comment=f"{'All' if all_pass else 'Not all'} items passed the boolean evaluation",
+        )
+
+    # Test data where some items should pass and some should fail
+    test_data = [
+        {"input": "What is the capital of Germany?", "expected_output": "Berlin"},
+        {"input": "What is the capital of France?", "expected_output": "Paris"},
+        {"input": "What is the capital of Spain?", "expected_output": "Madrid"},
+    ]
+
+    # Task that returns correct answers for Germany and France, but wrong for Spain
+    def mock_task_with_boolean_results(*, item: ExperimentItem, **kwargs):
+        input_val = (
+            item.get("input")
+            if isinstance(item, dict)
+            else getattr(item, "input", "unknown")
+        )
+        input_str = str(input_val) if input_val is not None else ""
+
+        if "Germany" in input_str:
+            return "The capital is Berlin"
+        elif "France" in input_str:
+            return "The capital is Paris"
+        else:
+            return "I don't know the capital"
+
+    result = langfuse_client.run_experiment(
+        name="Boolean score type test",
+        description="Test BOOLEAN data type in scores",
+        data=test_data,
+        task=mock_task_with_boolean_results,
+        evaluators=[boolean_evaluator],
+        run_evaluators=[boolean_run_evaluator],
+    )
+
+    # Validate basic result structure
+    assert len(result.item_results) == 3
+    assert len(result.run_evaluations) == 1
+
+    # Validate individual item evaluations have boolean values
+    expected_results = [
+        True,
+        True,
+        False,
+    ]  # Germany and France should pass, Spain should fail
+    for i, item_result in enumerate(result.item_results):
+        assert len(item_result.evaluations) == 1
+        eval_result = item_result.evaluations[0]
+        assert eval_result.name == "has_expected_content"
+        assert isinstance(eval_result.value, bool)
+        assert eval_result.value == expected_results[i]
+        assert eval_result.data_type == ScoreDataType.BOOLEAN
+
+    # Validate run evaluation is boolean and should be False (not all items passed)
+    run_eval = result.run_evaluations[0]
+    assert run_eval.name == "all_items_pass"
+    assert isinstance(run_eval.value, bool)
+    assert run_eval.value is False  # Spain should fail, so not all pass
+    assert run_eval.data_type == ScoreDataType.BOOLEAN
+
+    # Flush and wait for server processing
+    langfuse_client.flush()
+    time.sleep(3)
+
+    # Verify scores are persisted via API with correct data types
+    api = get_api()
+    for i, item_result in enumerate(result.item_results):
+        trace_id = item_result.trace_id
+        assert trace_id is not None, f"Item {i} should have a trace_id"
+
+        # Fetch trace from API to verify score persistence
+        trace = api.trace.get(trace_id)
+        assert trace is not None, f"Trace {trace_id} should exist"
+
+        for score in trace.scores:
+            assert score.data_type == "BOOLEAN"
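
The test exercises the fix end to end: item-level BOOLEAN scores from the evaluator, a run-level BOOLEAN score from the run evaluator, and a final check via the API that persisted scores keep their data type. To run just this test, standard pytest selection syntax applies; like the rest of this suite it presumably needs credentials for a live Langfuse instance, since get_client() and get_api() talk to a server:

    pytest tests/test_experiments.py::test_boolean_score_types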
