Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
7d048ed
Try adding second sorting for mki
Damonamajor Feb 26, 2025
bf585fd
Push test file
Damonamajor Feb 27, 2025
c60b5fb
remove second mergesort
Damonamajor Feb 27, 2025
8278bac
update sort
Damonamajor Aug 29, 2025
9de2cb9
update quintos sample
Damonamajor Sep 2, 2025
20451f2
Add test for mki ki matching
Damonamajor Sep 2, 2025
e97c413
comma based seed
Damonamajor Sep 2, 2025
d83c541
change to comma separated
Damonamajor Sep 2, 2025
d8bd9cf
precommit
Damonamajor Sep 2, 2025
7e518e0
add matching test
Damonamajor Sep 2, 2025
3432b95
precommit
Damonamajor Sep 2, 2025
0fe2336
Remove excess code
Damonamajor Sep 2, 2025
6908bb2
Delete assesspy/test.py
Damonamajor Sep 2, 2025
50082a9
rename test more accurately
Damonamajor Sep 3, 2025
d491933
update name again
Damonamajor Sep 3, 2025
8477716
Update assesspy/tests/test_metrics.py
Damonamajor Sep 3, 2025
e49fb16
Update assesspy/data/quintos_sample.csv
Damonamajor Sep 3, 2025
4fb55e9
use both csv files
Damonamajor Sep 3, 2025
f5be991
update test_metrics
Damonamajor Sep 3, 2025
fe5b445
Include documentation
Damonamajor Sep 4, 2025
e67f915
lintr
Damonamajor Sep 4, 2025
34e0b14
set as a fixture
Damonamajor Sep 4, 2025
e7d25ac
lintr
Damonamajor Sep 4, 2025
1c8918c
make one fixture
Damonamajor Sep 4, 2025
0c5692b
make one test
Damonamajor Sep 4, 2025
3fcc03b
re-add stray delete
Damonamajor Sep 4, 2025
455085d
remove unneeded pandas
Damonamajor Sep 4, 2025
aa7dd08
Add parametized test
Damonamajor Sep 4, 2025
c18b8ec
rename
Damonamajor Sep 4, 2025
a3b96a2
Add to conftest
Damonamajor Sep 4, 2025
31dc6eb
update conftest
Damonamajor Sep 10, 2025
6574b2a
lintr
Damonamajor Sep 10, 2025
db9823d
record commenting
Damonamajor Sep 11, 2025
53f52a0
more commenting
Damonamajor Sep 11, 2025
24a07de
move everything to test
Damonamajor Sep 11, 2025
bde3a77
Update load_data.py
Damonamajor Sep 11, 2025
132d803
Update test_metrics.py
Damonamajor Sep 11, 2025
5861e4b
Update assesspy/load_data.py
Damonamajor Sep 12, 2025
7ab91cb
Update assesspy/load_data.py
Damonamajor Sep 12, 2025
cb51fc4
Update assesspy/metrics.py
Damonamajor Sep 12, 2025
8bcdf41
Update docs/source/quintos_sample_with_tiebreaks.rst
Damonamajor Sep 12, 2025
8d4d995
lintr
Damonamajor Sep 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions assesspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .load_data import (
ccao_sample,
quintos_sample,
quintos_sample_with_tiebreaks,
)
from .metrics import (
cod,
Expand Down
2 changes: 1 addition & 1 deletion assesspy/data/quintos_sample.csv
Comment thread
Damonamajor marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ estimate,sale_price
192959,235000
180046,250000
200240,279000
211445,295000
211445,295000
31 changes: 31 additions & 0 deletions assesspy/data/quintos_sample_with_tiebreaks.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
sale_price,estimate,estimate_alt_sort_1,estimate_alt_sort_2
32900,37299.37125,37299.37125,37299.37125
36000,40165.89269,40165.89269,40165.89269
54000,56317.4201,56317.4201,56317.4201
64500,66183.77244,66183.77244,66183.77244
68000,69486.97316,69486.97316,69486.97316
70000,71514.52586,71514.52586,71514.52586
74000,75338.28603,75338.28603,75338.28603
80000,81035.95111,81035.95111,81035.95111
84900,85672.85577,85672.85577,85672.85577
89000,85021.0865,94088.93683,90046.33945
89000,90046.33945,85021.0865,94088.93683
89000,94088.93683,90046.33945,85021.0865
105900,100227.0936,100227.0936,100227.0936
109000,103156.7516,103156.7516,103156.7516
115000,108290.1277,108290.1277,108290.1277
124500,117098.7563,117098.7563,117098.7563
129900,115346.9796,115346.9796,115346.9796
135000,119678.4223,119678.4223,119678.4223
149000,131630.9478,131630.9478,131630.9478
155800,137321.2061,137321.2061,137321.2061
163500,143973.5639,143973.5639,143973.5639
175000,153571.8563,153571.8563,153571.8563
179000,148456.8866,148456.8866,148456.8866
185600,153488.3876,153488.3876,153488.3876
199900,165039.8271,165039.8271,165039.8271
215000,176939.5763,176939.5763,176939.5763
235000,192959.3127,192959.3127,192959.3127
250000,180046.1193,180046.1193,180046.1193
279000,200240.2442,200240.2442,200240.2442
295000,211445.4891,211445.4891,211445.4891
25 changes: 25 additions & 0 deletions assesspy/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,28 @@ def quintos_sample() -> pd.DataFrame:
source = files("assesspy").joinpath("data/quintos_sample.csv")
with as_file(source) as file:
return pd.read_csv(file)


def quintos_sample_with_tiebreaks() -> pd.DataFrame:
    """
    Modified version of the Quintos sample of sales and estimated market values
    that can be used to ensure that MKI/KI implementations are consistent when
    some sales have the same sale price but different estimates.

    The data is loaded from the ``data/quintos_sample_with_tiebreaks.csv``
    resource inside the ``assesspy`` package. The two ``estimate_alt_sort_*``
    columns hold the same estimates as ``estimate``, but with the estimates of
    the sales that share a sale price assigned in a different order.

    :return:
        A Pandas DataFrame with 30 observations and 4 variables:

        ================================= ============================================
        **sale_price** (`float`)          Recorded sale price of this property
        **estimate** (`float`)            Assessed fair market value
        **estimate_alt_sort_1** (`float`) Alternative FMV sort 1 for testing tiebreaks
        **estimate_alt_sort_2** (`float`) Alternative FMV sort 2 for testing tiebreaks
        ================================= ============================================

    :rtype: pd.DataFrame
    """
    source = files("assesspy").joinpath(
        "data/quintos_sample_with_tiebreaks.csv"
    )
    # Read the CSV while the context manager returned by as_file is active.
    with as_file(source) as file:
        return pd.read_csv(file)
17 changes: 15 additions & 2 deletions assesspy/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,21 @@ def _calculate_gini(
.reset_index(drop=True)
)
df = pd.concat([estimate, sale_price], axis=1)
# Mergesort is required for stable sort results
df.sort_values(by="sale_price", kind="mergesort", inplace=True)
# This Gini coefficient algorithm is sensitive to the order of the input
# observations: If multiple observations share the same sale price but have
# different estimates, the output coefficients will be different depending
# on which of the sales with identical prices gets ordered first in the
# input dataframe. To ensure a stable sort order, Quintos recommends
# sorting by ascending sale price and then by descending estimate to break
# any ties. This produces "worst case" MKI/KI statistics, but ensures those
# statistics are deterministic. See this issue for more discussion:
# https://github.com/ccao-data/assesspy/issues/33#issuecomment-3180632954
df.sort_values(
by=["sale_price", "estimate"],
ascending=[True, False],
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uses False for ascending order for estimate in accordance with our external guidance.

After a lot of deliberation, we decided the best way forward is to assume the "worst case scenario" in terms of MKI/KI metrics by sorting the data first by the ascending actual value (sale price) and then by the descending predicted value (modeled result). Not saying this has to be your solution, but wanted to share our thinking if helpful.

kind="mergesort",
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Question, optional] I wonder if this kwarg is still necessary? Per the pandas docs, kind is only used when sorting on a single column, but now we're sorting on two columns:

Choice of sorting algorithm. See also numpy.sort() for more information. mergesort and stable are the only stable algorithms. For DataFrames, this option is only applied when sorting on a single column or label.

I'm agnostic as to whether we should leave the kwarg in or take it out -- it doesn't seem to make anything worse to leave it in, and it could provide a layer of defensiveness preventing us from accidentally reintroducing an unstable sort if we ever decide to switch back to sorting on a single column -- but I'd be interested to see if the tests still pass when we take it out.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assumed that it wouldn't affect anything. The only reason I left it in was if we ever wanted to do something with the dataset externally, it would remain the same. Maybe we wanted to look at class once sorted by MKI. That's not really a good example, but I could imagine something along these lines.

I expect it to pass even without it.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine leaving it in!

inplace=True,
)
df.reset_index(drop=True, inplace=True)
a_sorted, sp_sorted = df["estimate"], df["sale_price"]
n: int = a_sorted.size
Expand Down
11 changes: 11 additions & 0 deletions assesspy/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@ def quintos_data() -> tuple:
return sample.estimate, sample.sale_price


@pt.fixture(scope="session")
def quintos_data_with_tiebreaks() -> tuple:
    """
    Session-scoped fixture exposing the columns of the Quintos tiebreak sample.

    :return:
        A 4-tuple of DataFrame columns, in this order: ``sale_price``,
        ``estimate``, ``estimate_alt_sort_1``, ``estimate_alt_sort_2``.
    """
    tiebreak_df = ap.quintos_sample_with_tiebreaks()
    # Order matters: tests unpack this tuple positionally.
    column_order = (
        "sale_price",
        "estimate",
        "estimate_alt_sort_1",
        "estimate_alt_sort_2",
    )
    return tuple(tiebreak_df[name] for name in column_order)


@pt.fixture(scope="session", params=["1_1", "1_4", "d_1", "d_2"])
def iaao_data_name(request):
    """
    Parametrized session fixture yielding one dataset name per parameter.

    :return: The current parameter value, one of ``"1_1"``, ``"1_4"``,
        ``"d_1"`` or ``"d_2"``.
    """
    dataset_name = request.param
    return dataset_name
Expand Down
20 changes: 20 additions & 0 deletions assesspy/tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,26 @@ def test_metric_value_is_correct_iaao(
pt.approx(result, rel=0.02) == expected[iaao_data_name][metric]
)

@pt.mark.parametrize("metric", ["mki", "ki"])
def test_mki_tiebreaks_consistent(
    self, metric, quintos_data_with_tiebreaks
):
    """
    Check that MKI and KI do not depend on the input order of tied sales.

    The fixture supplies one sale price column and three estimate columns.
    The metric computed from each alternative estimate column must equal
    the metric computed from the reference ``estimate`` column exactly.
    """
    # Unpack by name rather than indexing into the fixture tuple; this is
    # easier to interpret later.
    sale_price, estimate, estimate_alt_sort_1, estimate_alt_sort_2 = (
        quintos_data_with_tiebreaks
    )
    fn = getattr(ap, metric)

    ref_val = fn(estimate, sale_price)

    for idx, est in enumerate(
        (estimate_alt_sort_1, estimate_alt_sort_2), start=1
    ):
        val = fn(est, sale_price)
        # Exact equality is intentional: the test is about determinism,
        # not approximate agreement.
        assert val == ref_val, (
            f"{metric.upper()} differs between estimate and "
            f"estimate_alt_sort_{idx}: {ref_val} vs {val}"
        )

def test_metric_has_numeric_output(self, metric_val):
assert type(metric_val) is float

Expand Down
5 changes: 5 additions & 0 deletions docs/source/quintos_sample_with_tiebreaks.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
=======================================================================
Sample data from Quintos studies, modified to test sort order tiebreaks
=======================================================================

.. autofunction:: assesspy.quintos_sample_with_tiebreaks
Loading