1+ """
2+ Comprehensive comparison of Splinator vs scikit-learn calibration methods
3+ """
4+ import numpy as np
5+ import pandas as pd
6+ from sklearn .datasets import make_classification
7+ from sklearn .model_selection import train_test_split
8+ from sklearn .ensemble import GradientBoostingClassifier , RandomForestClassifier
9+ from sklearn .linear_model import LogisticRegression
10+ from sklearn .isotonic import IsotonicRegression
11+ from sklearn .calibration import calibration_curve
12+ from sklearn .metrics import brier_score_loss , log_loss
13+ from splinator .estimators import LinearSplineLogisticRegression
14+ from splinator .metrics import expected_calibration_error , spiegelhalters_z_statistic
15+ import time
16+ import warnings
17+ warnings .filterwarnings ('ignore' )
18+
def evaluate_calibrator(y_true, y_pred_calibrated, method_name):
    """Score one set of calibrated probabilities against the true labels.

    Computes four calibration-quality metrics (lower is better for all):
    Brier score, log loss, expected calibration error over 10 bins, and the
    absolute value of Spiegelhalter's z statistic.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels.
    y_pred_calibrated : array-like
        Predicted positive-class probabilities after calibration.
    method_name : str
        Label stored under the 'method' key to identify this calibrator.

    Returns
    -------
    dict
        Keys: 'method', 'brier_score', 'log_loss', 'ece', 'spiegelhalter_z'.
    """
    z_stat = spiegelhalters_z_statistic(y_true, y_pred_calibrated)
    return {
        'method': method_name,
        'brier_score': brier_score_loss(y_true, y_pred_calibrated),
        'log_loss': log_loss(y_true, y_pred_calibrated),
        'ece': expected_calibration_error(y_true, y_pred_calibrated, n_bins=10),
        'spiegelhalter_z': abs(z_stat),
    }
29+
def _fit_and_score_lslr(pred_cal, y_cal, pred_test, y_test, n_knots):
    """Fit one splinator LSLR calibrator and score it on the test split.

    Returns the metrics dict from evaluate_calibrator, augmented with
    'fit_time' (seconds spent fitting and predicting).
    """
    start = time.time()
    lslr = LinearSplineLogisticRegression(
        n_knots=n_knots,
        monotonicity="increasing",
        method='SLSQP',
        C=100,
        # Warm-start stage uses at most 2000 points, capped by the split size.
        two_stage_fitting_initial_size=min(2000, len(y_cal)),
    )
    lslr.fit(pred_cal.reshape(-1, 1), y_cal)
    calibrated = lslr.predict(pred_test.reshape(-1, 1))
    elapsed = time.time() - start
    # BUGFIX: no space before ')'. The reporting section filters rows on the
    # exact label 'LSLR (n_knots=50)'; the old label 'LSLR (n_knots=50 )'
    # never matched, so the relative-performance table printed NaN.
    result = evaluate_calibrator(y_test, calibrated, f'LSLR (n_knots={n_knots})')
    result['fit_time'] = elapsed
    return result


def run_comparison(n_samples=50000, n_features=20, n_knots=50):
    """Run one calibration benchmark: train a GBM, then compare calibrators.

    Trains a GradientBoostingClassifier on synthetic data, then recalibrates
    its probabilities with Platt scaling, isotonic regression, and splinator's
    LinearSplineLogisticRegression (at n_knots plus 20 and 100 knots).

    Parameters
    ----------
    n_samples : int
        Number of rows in the synthetic classification dataset.
    n_features : int
        Number of feature columns.
    n_knots : int
        Knot count for the primary LSLR configuration.

    Returns
    -------
    pandas.DataFrame
        One row per method with calibration metrics and fit time in seconds.
    """
    print(f"\n Running comparison with {n_samples} samples, {n_features} features, {n_knots} knots")

    # Generate data.
    # Keep n_informative + n_redundant < n_features so make_classification
    # does not raise (60% informative, 20% redundant, at least 1 each).
    n_informative = max(1, int(n_features * 0.6))
    n_redundant = max(1, int(n_features * 0.2))

    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_informative=n_informative, n_redundant=n_redundant,
                               flip_y=0.1, random_state=42)

    # Hold out 20% for testing, then carve 25% of the remainder (i.e. 20% of
    # the original data) off as the calibration split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_cal, y_train, y_cal = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

    # Base classifier whose probabilities will be recalibrated.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    # Uncalibrated positive-class probabilities on both held-out splits.
    pred_cal = clf.predict_proba(X_cal)[:, 1]
    pred_test = clf.predict_proba(X_test)[:, 1]

    results = []

    # Uncalibrated baseline. fit_time is 0.0 (nothing is fitted) rather than
    # absent, so the summary table does not show NaN for this row.
    result = evaluate_calibrator(y_test, pred_test, 'Uncalibrated')
    result['fit_time'] = 0.0
    results.append(result)

    # 1. Sigmoid calibration (Platt scaling): logistic regression on scores.
    start = time.time()
    lr = LogisticRegression()
    lr.fit(pred_cal.reshape(-1, 1), y_cal)
    lr_calibrated = lr.predict_proba(pred_test.reshape(-1, 1))[:, 1]
    sigmoid_time = time.time() - start
    result = evaluate_calibrator(y_test, lr_calibrated, 'Sigmoid (sklearn)')
    result['fit_time'] = sigmoid_time
    results.append(result)

    # 2. Isotonic regression, clipping test scores outside the fitted range.
    start = time.time()
    ir = IsotonicRegression(out_of_bounds='clip')
    ir.fit(pred_cal, y_cal)
    ir_calibrated = ir.predict(pred_test)
    isotonic_time = time.time() - start
    result = evaluate_calibrator(y_test, ir_calibrated, 'Isotonic (sklearn)')
    result['fit_time'] = isotonic_time
    results.append(result)

    # 3. Linear spline logistic regression (splinator): primary configuration
    #    first, then extra knot counts (the previously duplicated fit/score
    #    sequence now lives in _fit_and_score_lslr).
    results.append(_fit_and_score_lslr(pred_cal, y_cal, pred_test, y_test, n_knots))
    for n_k in [20, 100]:
        if n_k != n_knots:
            results.append(_fit_and_score_lslr(pred_cal, y_cal, pred_test, y_test, n_k))

    return pd.DataFrame(results)
115+
# Sweep dataset sizes and feature counts, collecting one result frame per run.
frames = []
for n_samples in [10000, 50000]:
    for n_features in [10, 30]:
        frame = run_comparison(n_samples=n_samples, n_features=n_features, n_knots=50)
        frame['n_samples'] = n_samples
        frame['n_features'] = n_features
        frames.append(frame)

# One long-format table: a row per method per experiment configuration.
all_results = pd.concat(frames, ignore_index=True)

# Shared pieces of the report.
banner = "=" * 80
metric_cols = ['brier_score', 'log_loss', 'ece', 'spiegelhalter_z']

print("\n " + banner)
print("CALIBRATION COMPARISON RESULTS")
print(banner)
print("\n Lower is better for: brier_score, log_loss, ece, spiegelhalter_z")
print("\n Average performance across all experiments:")
summary = all_results.groupby('method')[metric_cols + ['fit_time']].mean()
print(summary.round(4))

print("\n " + banner)
print("RELATIVE PERFORMANCE (% improvement over Uncalibrated)")
print(banner)

# Percent reduction of each metric relative to the uncalibrated baseline.
baseline = all_results[all_results['method'] == 'Uncalibrated'][metric_cols].mean()
for method in ['Sigmoid (sklearn)', 'Isotonic (sklearn)', 'LSLR (n_knots=50)']:
    scores = all_results[all_results['method'] == method][metric_cols].mean()
    improvement = (baseline - scores) / baseline * 100
    print(f"\n {method} :")
    for metric in improvement.index:
        print(f"  {metric} : {improvement[metric]:.1f} % improvement")

# Persist the full per-experiment table for later inspection.
all_results.to_csv('calibration_comparison_results.csv', index=False)
print("\n Detailed results saved to 'calibration_comparison_results.csv'")