From b1bb4c6b6bf6b09a017b9f8fe944f10766eea1e6 Mon Sep 17 00:00:00 2001 From: turvoy Date: Tue, 27 Sep 2022 15:39:12 +0200 Subject: [PATCH 01/10] stronger classifier for detection test --- .vscode/settings.json | 11 +++++++++++ sdmetrics/single_table/__init__.py | 3 ++- sdmetrics/single_table/detection/__init__.py | 5 +++-- sdmetrics/single_table/detection/sklearn.py | 17 +++++++++++++++++ 4 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..9780500e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,11 @@ +{ + "python.testing.unittestArgs": [ + "-v", + "-s", + "./tests", + "-p", + "*test.py" + ], + "python.testing.pytestEnabled": false, + "python.testing.unittestEnabled": true +} \ No newline at end of file diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py index 35704626..4990660d 100644 --- a/sdmetrics/single_table/__init__.py +++ b/sdmetrics/single_table/__init__.py @@ -6,7 +6,7 @@ from sdmetrics.single_table.bayesian_network import BNLikelihood, BNLogLikelihood from sdmetrics.single_table.detection.base import DetectionMetric from sdmetrics.single_table.detection.sklearn import ( - LogisticDetection, ScikitLearnClassifierDetectionMetric, SVCDetection) + LogisticDetection, ScikitLearnClassifierDetectionMetric, SVCDetection, GradientBoostingDetection) from sdmetrics.single_table.efficacy.base import MLEfficacyMetric from sdmetrics.single_table.efficacy.binary import ( BinaryAdaBoostClassifier, BinaryDecisionTreeClassifier, BinaryEfficacyMetric, @@ -47,6 +47,7 @@ 'DetectionMetric', 'LogisticDetection', 'SVCDetection', + 'GradientBoostingDetection', 'ScikitLearnClassifierDetectionMetric', 'MLEfficacyMetric', 'BinaryEfficacyMetric', diff --git a/sdmetrics/single_table/detection/__init__.py b/sdmetrics/single_table/detection/__init__.py index b987a119..9f4340b9 100644 --- 
a/sdmetrics/single_table/detection/__init__.py +++ b/sdmetrics/single_table/detection/__init__.py @@ -1,8 +1,9 @@ """Machine Learning Detection metrics for single table datasets.""" -from sdmetrics.single_table.detection.sklearn import LogisticDetection, SVCDetection +from sdmetrics.single_table.detection.sklearn import LogisticDetection, SVCDetection, GradientBoostingDetection __all__ = [ 'LogisticDetection', - 'SVCDetection' + 'SVCDetection', + 'GradientBoostingDetection' ] diff --git a/sdmetrics/single_table/detection/sklearn.py b/sdmetrics/single_table/detection/sklearn.py index a33a33d9..be656c4c 100644 --- a/sdmetrics/single_table/detection/sklearn.py +++ b/sdmetrics/single_table/detection/sklearn.py @@ -2,6 +2,7 @@ from sklearn.impute import SimpleImputer from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import GradientBoostingClassifier from sklearn.pipeline import Pipeline from sklearn.preprocessing import RobustScaler from sklearn.svm import SVC @@ -67,3 +68,19 @@ class SVCDetection(ScikitLearnClassifierDetectionMetric): @staticmethod def _get_classifier(): return SVC(probability=True, gamma='scale') + + +class GradientBoostingDetection(ScikitLearnClassifierDetectionMetric): + """ScikitLearnClassifierDetectionMetric based on a GradientBoostingClassifier. + + This metric builds a GradientBoostingClassifier Classifier that learns to tell the synthetic + data apart from the real data, which later on is evaluated using Cross Validation. + + The output of the metric is one minus the average ROC AUC score obtained. 
+ """ + + name = 'SVC Detection' + + @staticmethod + def _get_classifier(): + return GradientBoostingClassifier() \ No newline at end of file From 263e43eca6aaf3b4ac608b04c5d401a797132204 Mon Sep 17 00:00:00 2001 From: turvoy Date: Tue, 27 Sep 2022 17:25:38 +0200 Subject: [PATCH 02/10] make all tests --- sdmetrics/single_table/__init__.py | 3 ++- sdmetrics/single_table/detection/__init__.py | 7 ++++--- sdmetrics/single_table/detection/sklearn.py | 4 ++-- tests/integration/single_table/test_single_table.py | 4 +++- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py index 4990660d..ab6c76c0 100644 --- a/sdmetrics/single_table/__init__.py +++ b/sdmetrics/single_table/__init__.py @@ -6,7 +6,8 @@ from sdmetrics.single_table.bayesian_network import BNLikelihood, BNLogLikelihood from sdmetrics.single_table.detection.base import DetectionMetric from sdmetrics.single_table.detection.sklearn import ( - LogisticDetection, ScikitLearnClassifierDetectionMetric, SVCDetection, GradientBoostingDetection) + GradientBoostingDetection, LogisticDetection, ScikitLearnClassifierDetectionMetric, + SVCDetection) from sdmetrics.single_table.efficacy.base import MLEfficacyMetric from sdmetrics.single_table.efficacy.binary import ( BinaryAdaBoostClassifier, BinaryDecisionTreeClassifier, BinaryEfficacyMetric, diff --git a/sdmetrics/single_table/detection/__init__.py b/sdmetrics/single_table/detection/__init__.py index 9f4340b9..8450948b 100644 --- a/sdmetrics/single_table/detection/__init__.py +++ b/sdmetrics/single_table/detection/__init__.py @@ -1,9 +1,10 @@ """Machine Learning Detection metrics for single table datasets.""" -from sdmetrics.single_table.detection.sklearn import LogisticDetection, SVCDetection, GradientBoostingDetection +from sdmetrics.single_table.detection.sklearn import ( + GradientBoostingDetection, LogisticDetection, SVCDetection) __all__ = [ + 'GradientBoostingDetection', 
'LogisticDetection', - 'SVCDetection', - 'GradientBoostingDetection' + 'SVCDetection' ] diff --git a/sdmetrics/single_table/detection/sklearn.py b/sdmetrics/single_table/detection/sklearn.py index be656c4c..49eecafc 100644 --- a/sdmetrics/single_table/detection/sklearn.py +++ b/sdmetrics/single_table/detection/sklearn.py @@ -1,8 +1,8 @@ """scikit-learn based DetectionMetrics for single table datasets.""" +from sklearn.ensemble import GradientBoostingClassifier from sklearn.impute import SimpleImputer from sklearn.linear_model import LogisticRegression -from sklearn.ensemble import GradientBoostingClassifier from sklearn.pipeline import Pipeline from sklearn.preprocessing import RobustScaler from sklearn.svm import SVC @@ -83,4 +83,4 @@ class GradientBoostingDetection(ScikitLearnClassifierDetectionMetric): @staticmethod def _get_classifier(): - return GradientBoostingClassifier() \ No newline at end of file + return GradientBoostingClassifier() diff --git a/tests/integration/single_table/test_single_table.py b/tests/integration/single_table/test_single_table.py index 7ecd45b3..2f880887 100644 --- a/tests/integration/single_table/test_single_table.py +++ b/tests/integration/single_table/test_single_table.py @@ -7,7 +7,8 @@ from sdmetrics.goal import Goal from sdmetrics.single_table.base import SingleTableMetric from sdmetrics.single_table.bayesian_network import BNLikelihood, BNLogLikelihood -from sdmetrics.single_table.detection import LogisticDetection, SVCDetection +from sdmetrics.single_table.detection import ( + GradientBoostingDetection, LogisticDetection, SVCDetection) from sdmetrics.single_table.multi_column_pairs import ( ContingencySimilarity, ContinuousKLDivergence, DiscreteKLDivergence) from sdmetrics.single_table.multi_single_column import ( @@ -17,6 +18,7 @@ METRICS = [ CSTest, KSComplement, + GradientBoostingDetection, LogisticDetection, SVCDetection, ContinuousKLDivergence, From f8384198d89639d6428c4ec23fe36e13e6891046 Mon Sep 17 00:00:00 2001 From: 
turvoy Date: Tue, 27 Sep 2022 17:31:18 +0200 Subject: [PATCH 03/10] remove vscode dir --- .vscode/settings.json | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 9780500e..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "python.testing.unittestArgs": [ - "-v", - "-s", - "./tests", - "-p", - "*test.py" - ], - "python.testing.pytestEnabled": false, - "python.testing.unittestEnabled": true -} \ No newline at end of file From 4c5eae2ad0609058ad2bb15373f74fd55feb37b4 Mon Sep 17 00:00:00 2001 From: Tanguy Urvoy Date: Tue, 27 Sep 2022 17:33:22 +0200 Subject: [PATCH 04/10] Delete settings.json oops --- .vscode/settings.json | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 9780500e..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "python.testing.unittestArgs": [ - "-v", - "-s", - "./tests", - "-p", - "*test.py" - ], - "python.testing.pytestEnabled": false, - "python.testing.unittestEnabled": true -} \ No newline at end of file From 52115f31186c5e1809c1757a40c26e609bf2e132 Mon Sep 17 00:00:00 2001 From: turvoy Date: Tue, 27 Sep 2022 17:34:11 +0200 Subject: [PATCH 05/10] ignore vscode stuff --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 8c6107d1..b1ea75c2 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,5 @@ ENV/ # OS Files .DS_Store +# vscode stuff +.vscode/ From 818be31ba8442369bec331bb0caad7b9290c78b1 Mon Sep 17 00:00:00 2001 From: turvoy Date: Wed, 28 Sep 2022 17:55:28 +0200 Subject: [PATCH 06/10] fixed incorrect name for GB detection --- .vscode/settings.json | 11 +++++++++++ sdmetrics/single_table/detection/sklearn.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 
.vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..9780500e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,11 @@ +{ + "python.testing.unittestArgs": [ + "-v", + "-s", + "./tests", + "-p", + "*test.py" + ], + "python.testing.pytestEnabled": false, + "python.testing.unittestEnabled": true +} \ No newline at end of file diff --git a/sdmetrics/single_table/detection/sklearn.py b/sdmetrics/single_table/detection/sklearn.py index 49eecafc..38f1aa7e 100644 --- a/sdmetrics/single_table/detection/sklearn.py +++ b/sdmetrics/single_table/detection/sklearn.py @@ -79,7 +79,7 @@ class GradientBoostingDetection(ScikitLearnClassifierDetectionMetric): The output of the metric is one minus the average ROC AUC score obtained. """ - name = 'SVC Detection' + name = 'GradientBoosting Detection' @staticmethod def _get_classifier(): From f9a33a2c075b997cf8abedf9ac86fbee07c1c482 Mon Sep 17 00:00:00 2001 From: Tanguy Urvoy Date: Wed, 28 Sep 2022 18:05:42 +0200 Subject: [PATCH 07/10] Delete settings.json --- .vscode/settings.json | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 9780500e..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "python.testing.unittestArgs": [ - "-v", - "-s", - "./tests", - "-p", - "*test.py" - ], - "python.testing.pytestEnabled": false, - "python.testing.unittestEnabled": true -} \ No newline at end of file From afc2b259e917b79db6240fba10573c62a3732272 Mon Sep 17 00:00:00 2001 From: turvoy Date: Thu, 29 Sep 2022 13:54:54 +0200 Subject: [PATCH 08/10] Change the way detection score is computed: use mean raw auc as raw score and move 2-max(0.5,s)*2 transformation to normalization method. 
--- sdmetrics/single_table/detection/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sdmetrics/single_table/detection/base.py b/sdmetrics/single_table/detection/base.py index e3bc1295..4b44637f 100644 --- a/sdmetrics/single_table/detection/base.py +++ b/sdmetrics/single_table/detection/base.py @@ -50,7 +50,7 @@ def compute(cls, real_data, synthetic_data, metadata=None): This builds a Machine Learning Classifier that learns to tell the synthetic data apart from the real data, which later on is evaluated using Cross Validation. - The output of the metric is one minus the average ROC AUC score obtained. + The output of the metric is the average ROC AUC score obtained. Args: real_data (Union[numpy.ndarray, pandas.DataFrame]): @@ -85,9 +85,10 @@ def compute(cls, real_data, synthetic_data, metadata=None): y_pred = cls._fit_predict(X[train_index], y[train_index], X[test_index]) roc_auc = roc_auc_score(y[test_index], y_pred) - scores.append(max(0.5, roc_auc) * 2 - 1) + #scores.append(max(0.5, roc_auc) * 2 - 1) + scores.append(roc_auc) - return 1 - np.mean(scores) + return np.mean(scores) except ValueError as err: raise IncomputableMetricError(f'DetectionMetric: Unable to be fit with error {err}') @@ -103,4 +104,5 @@ def normalize(cls, raw_score): float: Simply returns `raw_score`. """ - return super().normalize(raw_score) + score = 2 - 2 * max(0.5,raw_score) + return super().normalize(score) From d20003f203ac44e530137eb13a063e8a3ad0e31c Mon Sep 17 00:00:00 2001 From: turvoy Date: Thu, 29 Sep 2022 14:10:41 +0200 Subject: [PATCH 09/10] Change the way detection score is computed. updated code comments. 
--- sdmetrics/single_table/detection/base.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sdmetrics/single_table/detection/base.py b/sdmetrics/single_table/detection/base.py index 4b44637f..c84191c6 100644 --- a/sdmetrics/single_table/detection/base.py +++ b/sdmetrics/single_table/detection/base.py @@ -85,8 +85,7 @@ def compute(cls, real_data, synthetic_data, metadata=None): y_pred = cls._fit_predict(X[train_index], y[train_index], X[test_index]) roc_auc = roc_auc_score(y[test_index], y_pred) - #scores.append(max(0.5, roc_auc) * 2 - 1) - scores.append(roc_auc) + scores.append(max(0.5,roc_auc)) return np.mean(scores) except ValueError as err: @@ -94,7 +93,7 @@ def compute(cls, real_data, synthetic_data, metadata=None): @classmethod def normalize(cls, raw_score): - """Return the `raw_score` as is, since it is already normalized. + """Return the `raw_score` normalized to be higher-is-better in [0,1]. Args: raw_score (float): @@ -102,7 +101,8 @@ def normalize(cls, raw_score): Returns: float: - Simply returns `raw_score`. + Returns `2*(1-raw_score)`. """ - score = 2 - 2 * max(0.5,raw_score) + assert raw_score >= 0.5, "raw auc score should be in [0.5,1]" + score = 2 * (1 - raw_score) return super().normalize(score) From fb6a328e5a450a108b920c78a923b00d3ce962c2 Mon Sep 17 00:00:00 2001 From: Tanguy Urvoy Date: Tue, 22 Aug 2023 17:30:24 +0200 Subject: [PATCH 10/10] Update base.py --- sdmetrics/single_table/detection/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdmetrics/single_table/detection/base.py b/sdmetrics/single_table/detection/base.py index c84191c6..c27dc1b9 100644 --- a/sdmetrics/single_table/detection/base.py +++ b/sdmetrics/single_table/detection/base.py @@ -34,7 +34,7 @@ class DetectionMetric(SingleTableMetric): """ name = 'SingleTable Detection' - goal = Goal.MAXIMIZE + goal = Goal.MINIMIZE min_value = 0.0 max_value = 1.0