diff --git a/assesspy/__init__.py b/assesspy/__init__.py index 8d8652a..a6245ba 100644 --- a/assesspy/__init__.py +++ b/assesspy/__init__.py @@ -7,6 +7,7 @@ from .load_data import ( ccao_sample, quintos_sample, + quintos_sample_with_tiebreaks, ) from .metrics import ( cod, diff --git a/assesspy/data/quintos_sample.csv b/assesspy/data/quintos_sample.csv index 6c370c4..0db90c9 100644 --- a/assesspy/data/quintos_sample.csv +++ b/assesspy/data/quintos_sample.csv @@ -28,4 +28,4 @@ estimate,sale_price 192959,235000 180046,250000 200240,279000 -211445,295000 +211445,295000 \ No newline at end of file diff --git a/assesspy/data/quintos_sample_with_tiebreaks.csv b/assesspy/data/quintos_sample_with_tiebreaks.csv new file mode 100644 index 0000000..607fc46 --- /dev/null +++ b/assesspy/data/quintos_sample_with_tiebreaks.csv @@ -0,0 +1,31 @@ +sale_price,estimate,estimate_alt_sort_1,estimate_alt_sort_2 +32900,37299.37125,37299.37125,37299.37125 +36000,40165.89269,40165.89269,40165.89269 +54000,56317.4201,56317.4201,56317.4201 +64500,66183.77244,66183.77244,66183.77244 +68000,69486.97316,69486.97316,69486.97316 +70000,71514.52586,71514.52586,71514.52586 +74000,75338.28603,75338.28603,75338.28603 +80000,81035.95111,81035.95111,81035.95111 +84900,85672.85577,85672.85577,85672.85577 +89000,85021.0865,94088.93683,90046.33945 +89000,90046.33945,85021.0865,94088.93683 +89000,94088.93683,90046.33945,85021.0865 +105900,100227.0936,100227.0936,100227.0936 +109000,103156.7516,103156.7516,103156.7516 +115000,108290.1277,108290.1277,108290.1277 +124500,117098.7563,117098.7563,117098.7563 +129900,115346.9796,115346.9796,115346.9796 +135000,119678.4223,119678.4223,119678.4223 +149000,131630.9478,131630.9478,131630.9478 +155800,137321.2061,137321.2061,137321.2061 +163500,143973.5639,143973.5639,143973.5639 +175000,153571.8563,153571.8563,153571.8563 +179000,148456.8866,148456.8866,148456.8866 +185600,153488.3876,153488.3876,153488.3876 +199900,165039.8271,165039.8271,165039.8271 
+215000,176939.5763,176939.5763,176939.5763 +235000,192959.3127,192959.3127,192959.3127 +250000,180046.1193,180046.1193,180046.1193 +279000,200240.2442,200240.2442,200240.2442 +295000,211445.4891,211445.4891,211445.4891 \ No newline at end of file diff --git a/assesspy/load_data.py b/assesspy/load_data.py index dcca6c7..9a569be 100644 --- a/assesspy/load_data.py +++ b/assesspy/load_data.py @@ -50,3 +50,28 @@ def quintos_sample() -> pd.DataFrame: source = files("assesspy").joinpath("data/quintos_sample.csv") with as_file(source) as file: return pd.read_csv(file) + + +def quintos_sample_with_tiebreaks() -> pd.DataFrame: + """ + Modified version of the Quintos sample of sales and estimated market values + that can be used to ensure that MKI/KI implementations are consistent when + some sales have the same sale price but different estimates. + + :return: + A Pandas DataFrame with 30 observations and 4 variables: + + ======================== ===================================================== + **sale_price** (`float`) Recorded sale price of this property + **estimate** (`float`) Assessed fair market value + **estimate_alt_sort_1** (`float`) Alternative FMV sort 1 for testing tiebreaks + **estimate_alt_sort_2** (`float`) Alternative FMV sort 2 for testing tiebreaks + ======================== ===================================================== + + :rtype: pd.DataFrame + """ + source = files("assesspy").joinpath( + "data/quintos_sample_with_tiebreaks.csv" + ) + with as_file(source) as file: + return pd.read_csv(file) diff --git a/assesspy/metrics.py b/assesspy/metrics.py index 0635796..43d21db 100644 --- a/assesspy/metrics.py +++ b/assesspy/metrics.py @@ -211,8 +211,21 @@ def _calculate_gini( .reset_index(drop=True) ) df = pd.concat([estimate, sale_price], axis=1) - # Mergesort is required for stable sort results - df.sort_values(by="sale_price", kind="mergesort", inplace=True) + # This Gini coefficient algorithm is sensitive to the order of the input + # observations: 
If multiple observations share the same sale price but have + # different estimates, the output coefficients will be different depending + # on which of the sales with identical prices gets ordered first in the + # input dataframe. To ensure a stable sort order, Quintos recommends + # sorting by ascending sale price and then by descending estimate to break + # any ties. This produces "worst case" MKI/KI statistics, but ensures those + # statistics are deterministic. See this issue for more discussion: + # https://github.com/ccao-data/assesspy/issues/33#issuecomment-3180632954 + df.sort_values( + by=["sale_price", "estimate"], + ascending=[True, False], + kind="mergesort", + inplace=True, + ) df.reset_index(drop=True, inplace=True) a_sorted, sp_sorted = df["estimate"], df["sale_price"] n: int = a_sorted.size diff --git a/assesspy/tests/conftest.py b/assesspy/tests/conftest.py index 725e6d4..61f335a 100644 --- a/assesspy/tests/conftest.py +++ b/assesspy/tests/conftest.py @@ -27,6 +27,17 @@ def quintos_data() -> tuple: return sample.estimate, sample.sale_price +@pt.fixture(scope="session") +def quintos_data_with_tiebreaks() -> tuple: + sample = ap.quintos_sample_with_tiebreaks() + return ( + sample.sale_price, + sample.estimate, + sample.estimate_alt_sort_1, + sample.estimate_alt_sort_2, + ) + + @pt.fixture(scope="session", params=["1_1", "1_4", "d_1", "d_2"]) def iaao_data_name(request): return request.param diff --git a/assesspy/tests/test_metrics.py b/assesspy/tests/test_metrics.py index 50603c6..1fde9b8 100644 --- a/assesspy/tests/test_metrics.py +++ b/assesspy/tests/test_metrics.py @@ -57,6 +57,26 @@ def test_metric_value_is_correct_iaao( pt.approx(result, rel=0.02) == expected[iaao_data_name][metric] ) + @pt.mark.parametrize("metric", ["mki", "ki"]) + def test_mki_tiebreaks_consistent( + self, metric, quintos_data_with_tiebreaks + ): + sale_price, estimate, estimate_alt_sort_1, estimate_alt_sort_2 = ( + quintos_data_with_tiebreaks + ) + fn = getattr(ap, metric) 
+ + ref_val = fn(estimate, sale_price) + + for idx, est in enumerate( + (estimate_alt_sort_1, estimate_alt_sort_2), start=1 + ): + val = fn(est, sale_price) + assert val == ref_val, ( + f"{metric.upper()} differs between estimate[0] and estimate_alt_sort_{idx}: " + f"{ref_val} vs {val}" + ) + def test_metric_has_numeric_output(self, metric_val): assert type(metric_val) is float diff --git a/docs/source/quintos_sample_with_tiebreaks.rst b/docs/source/quintos_sample_with_tiebreaks.rst new file mode 100644 index 0000000..dac9530 --- /dev/null +++ b/docs/source/quintos_sample_with_tiebreaks.rst @@ -0,0 +1,5 @@ +======================================================================= +Sample data from Quintos studies, modified to test sort order tiebreaks +======================================================================= + +.. autofunction:: assesspy.quintos_sample_with_tiebreaks