ccao-data · Damonamajor · Sep 12, 2025 · Feb 26, 2025 · Feb 27, 2025 · Feb 27, 2025
@@ -7,6 +7,7 @@
 from .load_data import (
     ccao_sample,
     quintos_sample,
+    quintos_sample_with_tiebreaks,
 )
 from .metrics import (
     cod,

@@ -28,4 +28,4 @@ estimate,sale_price
 192959,235000
 180046,250000
 200240,279000
-211445,295000
+211445,295000
@@ -0,0 +1,31 @@
+sale_price,estimate,estimate_alt_sort_1,estimate_alt_sort_2
+32900,37299.37125,37299.37125,37299.37125
+36000,40165.89269,40165.89269,40165.89269
+54000,56317.4201,56317.4201,56317.4201
+64500,66183.77244,66183.77244,66183.77244
+68000,69486.97316,69486.97316,69486.97316
+70000,71514.52586,71514.52586,71514.52586
+74000,75338.28603,75338.28603,75338.28603
+80000,81035.95111,81035.95111,81035.95111
+84900,85672.85577,85672.85577,85672.85577
+89000,85021.0865,94088.93683,90046.33945
+89000,90046.33945,85021.0865,94088.93683
+89000,94088.93683,90046.33945,85021.0865
+105900,100227.0936,100227.0936,100227.0936
+109000,103156.7516,103156.7516,103156.7516
+115000,108290.1277,108290.1277,108290.1277
+124500,117098.7563,117098.7563,117098.7563
+129900,115346.9796,115346.9796,115346.9796
+135000,119678.4223,119678.4223,119678.4223
+149000,131630.9478,131630.9478,131630.9478
+155800,137321.2061,137321.2061,137321.2061
+163500,143973.5639,143973.5639,143973.5639
+175000,153571.8563,153571.8563,153571.8563
+179000,148456.8866,148456.8866,148456.8866
+185600,153488.3876,153488.3876,153488.3876
+199900,165039.8271,165039.8271,165039.8271
+215000,176939.5763,176939.5763,176939.5763
+235000,192959.3127,192959.3127,192959.3127
+250000,180046.1193,180046.1193,180046.1193
+279000,200240.2442,200240.2442,200240.2442
+295000,211445.4891,211445.4891,211445.4891
@@ -50,3 +50,28 @@ def quintos_sample() -> pd.DataFrame:
     source = files("assesspy").joinpath("data/quintos_sample.csv")
     with as_file(source) as file:
         return pd.read_csv(file)
+
+
+def quintos_sample_with_tiebreaks() -> pd.DataFrame:
+    """
+    Modified version of the Quintos sample of sales and estimated market values
+    that can be used to ensure that MKI/KI implementations are consistent when
+    some sales have the same sale price but different estimates.
+
+    :return:
+        A Pandas DataFrame with 30 observation and 4 variables:
+
+        ======================== =====================================================
+        **sale_price** (`float`)     Recorded sale price of this property
+        **estimate** (`float`)       Assessed fair market value
+        **estimate_alt_sort_1** (`float`)  Alternative FMV sort 1 for testing tiebreaks
+        **estimate_alt_sort_2** (`float`)  Alternative FMV sort 2 for testing tiebreaks
+        ======================== =====================================================
+
+    :rtype: pd.DataFrame
+    """
+    source = files("assesspy").joinpath(
+        "data/quintos_sample_with_tiebreaks.csv"
+    )
+    with as_file(source) as file:
+        return pd.read_csv(file)
@@ -211,8 +211,21 @@ def _calculate_gini(
         .reset_index(drop=True)
     )
     df = pd.concat([estimate, sale_price], axis=1)
-    # Mergesort is required for stable sort results
-    df.sort_values(by="sale_price", kind="mergesort", inplace=True)
+    # This Gini coefficient algorithm is sensitive to the order of the input
+    # observations: If multiple observations share the same sale price but have
+    # different estimates, the output coefficients will be different depending
+    # on which of the sales with identical prices gets ordered first in the
+    # input dataframe. To ensure a stable sort order, Quintos recommends
+    # sorting by ascending sale price and then by descending estimate to break
+    # any ties. This produces "worst case" MKI/KI statistics, but ensures those
+    # statistics are deterministic. See this issue for more discussion:
+    # https://github.com/ccao-data/assesspy/issues/33#issuecomment-3180632954
+    df.sort_values(
+        by=["sale_price", "estimate"],
+        ascending=[True, False],
+        kind="mergesort",
+        inplace=True,
+    )
     df.reset_index(drop=True, inplace=True)
     a_sorted, sp_sorted = df["estimate"], df["sale_price"]
     n: int = a_sorted.size

@@ -27,6 +27,17 @@ def quintos_data() -> tuple:
     return sample.estimate, sample.sale_price
 
 
+@pt.fixture(scope="session")
+def quintos_data_with_tiebreaks() -> tuple:
+    sample = ap.quintos_sample_with_tiebreaks()
+    return (
+        sample.sale_price,
+        sample.estimate,
+        sample.estimate_alt_sort_1,
+        sample.estimate_alt_sort_2,
+    )
+
+
 @pt.fixture(scope="session", params=["1_1", "1_4", "d_1", "d_2"])
 def iaao_data_name(request):
     """Return the active fixture parameter, one of the ``params`` values."""
     return request.param

@@ -57,6 +57,26 @@ def test_metric_value_is_correct_iaao(
                 pt.approx(result, rel=0.02) == expected[iaao_data_name][metric]
             )
 
+    @pt.mark.parametrize("metric", ["mki", "ki"])
+    def test_mki_tiebreaks_consistent(
+        self, metric, quintos_data_with_tiebreaks
+    ):
+        sale_price, estimate, estimate_alt_sort_1, estimate_alt_sort_2 = (
+            quintos_data_with_tiebreaks
+        )
+        fn = getattr(ap, metric)
+
+        ref_val = fn(estimate, sale_price)
+
+        for idx, est in enumerate(
+            (estimate_alt_sort_1, estimate_alt_sort_2), start=1
+        ):
+            val = fn(est, sale_price)
+            assert val == ref_val, (
+                f"{metric.upper()} differs between estimate[0] and estimate_alt_sort_{idx}: "
+                f"{ref_val} vs {val}"
+            )
+
     def test_metric_has_numeric_output(self, metric_val):
         """Assert that ``metric_val`` is exactly a built-in ``float``."""
         # ``type(...) is float`` also rejects subclasses of ``float``.
         assert type(metric_val) is float
 

@@ -0,0 +1,5 @@
+================================================================================
+Sample data from Quintos studies, modified to test sort order tiebreaks
+================================================================================
+
+.. autofunction:: assesspy.quintos_sample_with_tiebreaks
-Original file line number
+Diff line change
@@ Expand Up / @@ -28,4 +28,4 @@ estimate,sale_price @@
 ,235000
 ,250000
 ,279000
-,295000
+,295000