@@ -668,3 +668,132 @@ def test_format_experiment_results_basic():
668
668
669
669
langfuse_client .flush ()
670
670
time .sleep (1 )
671
+
672
+
673
def test_boolean_score_types():
    """Test that BOOLEAN score types are properly ingested and persisted.

    Runs a small experiment with an item-level and a run-level evaluator that
    both emit ``ScoreDataType.BOOLEAN`` scores, asserts the in-memory results,
    then verifies via the public API that the persisted scores kept the
    BOOLEAN data type end-to-end.
    """
    from langfuse.api import ScoreDataType

    langfuse_client = get_client()

    def boolean_evaluator(*, input, output, expected_output=None, **kwargs):
        """Item evaluator: boolean score for whether output contains the expected answer."""
        if not expected_output:
            # Nothing to compare against — report a deterministic False.
            return Evaluation(
                name="has_expected_content",
                value=False,
                data_type=ScoreDataType.BOOLEAN,
                comment="No expected output to check",
            )

        # Case-insensitive substring match of the expected answer in the task output.
        contains_expected = expected_output.lower() in str(output).lower()
        return Evaluation(
            name="has_expected_content",
            value=contains_expected,
            data_type=ScoreDataType.BOOLEAN,
            comment=f"Output {'contains' if contains_expected else 'does not contain'} expected content",
        )

    def boolean_run_evaluator(*, item_results: List[ExperimentItemResult], **kwargs):
        """Run evaluator: True only if every item's boolean evaluation passed."""
        if not item_results:
            return Evaluation(
                name="all_items_pass",
                value=False,
                data_type=ScoreDataType.BOOLEAN,
                comment="No items to evaluate",
            )

        # Idiomatic all(): False iff any item carries a failed
        # "has_expected_content" evaluation (value is False).
        all_pass = all(
            not (
                evaluation.name == "has_expected_content"
                and evaluation.value is False
            )
            for item_result in item_results
            for evaluation in item_result.evaluations
        )

        return Evaluation(
            name="all_items_pass",
            value=all_pass,
            data_type=ScoreDataType.BOOLEAN,
            comment=f"{'All' if all_pass else 'Not all'} items passed the boolean evaluation",
        )

    # Test data where some items should pass and some should fail.
    test_data = [
        {"input": "What is the capital of Germany?", "expected_output": "Berlin"},
        {"input": "What is the capital of France?", "expected_output": "Paris"},
        {"input": "What is the capital of Spain?", "expected_output": "Madrid"},
    ]

    # Task that answers correctly for Germany and France, but not for Spain,
    # so the run-level "all_items_pass" score must come out False.
    def mock_task_with_boolean_results(*, item: ExperimentItem, **kwargs):
        input_val = (
            item.get("input")
            if isinstance(item, dict)
            else getattr(item, "input", "unknown")
        )
        input_str = str(input_val) if input_val is not None else ""

        if "Germany" in input_str:
            return "The capital is Berlin"
        elif "France" in input_str:
            return "The capital is Paris"
        else:
            return "I don't know the capital"

    result = langfuse_client.run_experiment(
        name="Boolean score type test",
        description="Test BOOLEAN data type in scores",
        data=test_data,
        task=mock_task_with_boolean_results,
        evaluators=[boolean_evaluator],
        run_evaluators=[boolean_run_evaluator],
    )

    # Validate basic result structure.
    assert len(result.item_results) == 3
    assert len(result.run_evaluations) == 1

    # Validate individual item evaluations have boolean values:
    # Germany and France should pass, Spain should fail.
    expected_results = [True, True, False]
    for i, item_result in enumerate(result.item_results):
        assert len(item_result.evaluations) == 1
        eval_result = item_result.evaluations[0]
        assert eval_result.name == "has_expected_content"
        assert isinstance(eval_result.value, bool)
        assert eval_result.value == expected_results[i]
        assert eval_result.data_type == ScoreDataType.BOOLEAN

    # Validate run evaluation is boolean and False (Spain failed, so not all pass).
    run_eval = result.run_evaluations[0]
    assert run_eval.name == "all_items_pass"
    assert isinstance(run_eval.value, bool)
    assert run_eval.value is False
    assert run_eval.data_type == ScoreDataType.BOOLEAN

    # Flush and wait for server-side ingestion before querying the API.
    langfuse_client.flush()
    time.sleep(3)

    # Verify scores are persisted via API with the correct data type.
    api = get_api()
    for i, item_result in enumerate(result.item_results):
        trace_id = item_result.trace_id
        assert trace_id is not None, f"Item {i} should have a trace_id"

        # Fetch the trace from the API to verify score persistence.
        trace = api.trace.get(trace_id)
        assert trace is not None, f"Trace {trace_id} should exist"

        for score in trace.scores:
            assert score.data_type == "BOOLEAN"
0 commit comments