Skip to content

Commit acb9d0d

Browse files
authored
Remove unnecessary get_path helper (#102)
Since #80, we always load the data from a local dir. So this helper is unnecessary.
1 parent ffe65e4 commit acb9d0d

File tree

5 files changed: +19 / -70 lines changed

python/housing_data/build_places.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ def get_name_spelling(places_df: pd.DataFrame) -> pd.Series:
293293

294294

295295
def load_places(
296-
data_repo_path: Optional[Path], counties_population_df: pd.DataFrame = None
296+
data_repo_path: Path, counties_population_df: pd.DataFrame = None
297297
) -> tuple[pd.DataFrame, pd.DataFrame]:
298298
raw_places_df = pd.concat(
299299
[
@@ -310,7 +310,7 @@ def load_places(
310310
raw_places_df.to_parquet(PUBLIC_DIR / "places_annual_without_population.parquet")
311311

312312
place_populations_df = place_population.get_place_population_estimates(
313-
data_path=data_repo_path / PLACE_POPULATION_DIR if data_repo_path else None
313+
data_path=data_repo_path / PLACE_POPULATION_DIR
314314
)
315315
place_populations_df = fix_nyc_boroughs_population(
316316
place_populations_df, counties_population_df

python/housing_data/county_population.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import pandas as pd
55
import us
66
from housing_data.build_data_utils import impute_2025_population
7-
from housing_data.data_loading_helpers import get_url_text
87
from housing_data.fips_crosswalk import load_fips_crosswalk
98

109

@@ -119,11 +118,7 @@ def get_county_fips_crosswalk(data_repo_path: Path) -> pd.DataFrame:
119118

120119

121120
def get_county_populations_1990s(data_path: Path) -> pd.DataFrame:
122-
table_text = get_url_text(
123-
"https://www2.census.gov/programs-surveys/popest/tables/1990-2000/counties/totals/99c8_00.txt",
124-
data_path,
125-
encoding="latin_1",
126-
)
121+
table_text = (data_path / "99c8_00.txt").read_text(encoding="latin_1")
127122

128123
table_text = table_text[: table_text.index("Block 2")].strip()
129124

python/housing_data/data_loading_helpers.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,3 @@ def get_url_text(
3434
else:
3535
web_url = os.path.join(web_prefix, common_path)
3636
return requests.get(web_url).text
37-
38-
39-
def get_path(url: str, data_path: Optional[Path]) -> str:
40-
if data_path is not None:
41-
return str(Path(data_path, Path(url).name))
42-
else:
43-
return url

python/housing_data/place_population.py

Lines changed: 13 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,10 @@
55
import numpy as np
66
import pandas as pd
77
from housing_data.build_data_utils import impute_2025_population
8-
from housing_data.data_loading_helpers import get_path, get_url_text
98

109

11-
def _get_places_crosswalk_df(data_path: Optional[Path] = None) -> pd.DataFrame:
12-
df = pd.read_fwf(
13-
get_path(
14-
"https://www2.census.gov/geo/tiger/PREVGENZ/pl/us_places.txt", data_path
15-
)
16-
)
10+
def _get_places_crosswalk_df(data_path: Path) -> pd.DataFrame:
11+
df = pd.read_fwf(data_path / "us_places.txt")
1712

1813
df["State Code"] = df["CENSUS"] // 10000
1914
df["Place Code"] = df["CENSUS"] % 10000
@@ -79,7 +74,7 @@ def get_unincorporated_places_populations_1980() -> pd.DataFrame:
7974
return remainder_df
8075

8176

82-
def get_place_populations_1980(data_path: Optional[Path]) -> pd.DataFrame:
77+
def get_place_populations_1980(data_path: Path) -> pd.DataFrame:
8378
# Assuming this is run from `python/`
8479
# For the header row, use the nice descriptive names that IPUMS provides rather than the code names
8580
df = pd.read_csv("../raw_data/nhgis0015_ds104_1980_place_070.csv", header=1)
@@ -147,12 +142,8 @@ def get_place_populations_1980(data_path: Optional[Path]) -> pd.DataFrame:
147142
return df
148143

149144

150-
def _load_raw_place_populations_1990s(data_path: Optional[Path]) -> pd.DataFrame:
151-
tables = get_url_text(
152-
"https://www2.census.gov/programs-surveys/popest/tables/1990-2000/"
153-
"2000-subcounties-evaluation-estimates/sc2000f_us.txt",
154-
data_path,
155-
).split("\f")
145+
def _load_raw_place_populations_1990s(data_path: Path) -> pd.DataFrame:
146+
tables = (data_path / "sc2000f_us.txt").read_text().split("\f")
156147

157148
common_cols = [
158149
"Block",
@@ -278,7 +269,7 @@ def remove_duplicate_cities(df: pd.DataFrame) -> pd.DataFrame:
278269
return df[~place_state_tuples.isin(dupe_cities)]
279270

280271

281-
def get_place_populations_1990s(data_path: Optional[Path]) -> pd.DataFrame:
272+
def get_place_populations_1990s(data_path: Path) -> pd.DataFrame:
282273
combined_df = _load_raw_place_populations_1990s(data_path)
283274

284275
city_rows = (
@@ -396,14 +387,8 @@ def _melt_df(
396387
)
397388

398389

399-
def get_place_populations_2000s(data_path: Optional[Path]) -> pd.DataFrame:
400-
df = pd.read_csv(
401-
get_path(
402-
"https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/cities/sub-est00int.csv",
403-
data_path,
404-
),
405-
encoding="latin_1",
406-
)
390+
def get_place_populations_2000s(data_path: Path) -> pd.DataFrame:
391+
df = pd.read_csv(data_path / "sub-est00int.csv", encoding="latin_1")
407392
return _melt_df(
408393
df,
409394
years=list(range(2000, 2011)),
@@ -412,26 +397,14 @@ def get_place_populations_2000s(data_path: Optional[Path]) -> pd.DataFrame:
412397
)
413398

414399

415-
def get_place_populations_2010s(data_path: Optional[Path]) -> pd.DataFrame:
416-
df = pd.read_csv(
417-
get_path(
418-
"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/SUB-EST2020_ALL.csv",
419-
data_path,
420-
),
421-
encoding="latin_1",
422-
)
400+
def get_place_populations_2010s(data_path: Path) -> pd.DataFrame:
401+
df = pd.read_csv(data_path / "SUB-EST2020_ALL.csv", encoding="latin_1")
423402

424403
return _melt_df(df, years=list(range(2010, 2021)))
425404

426405

427-
def get_place_populations_2020s(data_path: Optional[Path]) -> pd.DataFrame:
428-
df = pd.read_csv(
429-
get_path(
430-
"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/sub-est2024.csv",
431-
data_path,
432-
),
433-
encoding="latin_1",
434-
)
406+
def get_place_populations_2020s(data_path: Path) -> pd.DataFrame:
407+
df = pd.read_csv(data_path / "sub-est2024.csv", encoding="latin_1")
435408
df = _melt_df(df, years=list(range(2020, 2025)))
436409
df = impute_2025_population(df)
437410
return df
@@ -482,7 +455,7 @@ def interpolate_1980s_populations(
482455
return interp_df
483456

484457

485-
def get_place_population_estimates(data_path: Optional[Path] = None) -> pd.DataFrame:
458+
def get_place_population_estimates(data_path: Path) -> pd.DataFrame:
486459
"""
487460
Returns a DataFrame with the columns:
488461
- state_code (int)

python/housing_data/state_population.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import pandas as pd
55
import us
66
from housing_data.build_data_utils import impute_2025_population
7-
from housing_data.data_loading_helpers import get_path, get_url_text
87

98
DIVISIONS = {
109
"New England": [
@@ -75,10 +74,7 @@ def _line_to_cols(row: str) -> list[str]:
7574

7675

7776
def get_state_populations_1980s(data_path: Path) -> pd.DataFrame:
78-
states_80s_text = get_url_text(
79-
"https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st8090ts.txt",
80-
data_path,
81-
)
77+
states_80s_text = (data_path / "st8090ts.txt").read_text()
8278
handle = StringIO(states_80s_text)
8379

8480
for _ in range(10):
@@ -168,10 +164,7 @@ def get_state_populations_1990s(data_path: Path) -> pd.DataFrame:
168164

169165
def get_state_populations_2000s(data_path: Path) -> pd.DataFrame:
170166
df = pd.read_excel(
171-
get_path(
172-
"https://www2.census.gov/programs-surveys/popest/tables/2000-2010/intercensal/state/st-est00int-01.xls",
173-
data_path,
174-
),
167+
data_path / "st-est00int-01.xls",
175168
skiprows=3,
176169
skipfooter=8,
177170
)
@@ -206,12 +199,7 @@ def _melt_df(df: pd.DataFrame, years: list[int]) -> pd.DataFrame:
206199

207200

208201
def get_state_populations_2010s(data_path: Path) -> pd.DataFrame:
209-
df = pd.read_csv(
210-
get_path(
211-
"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/state/totals/nst-est2020-alldata.csv",
212-
data_path,
213-
)
214-
)
202+
df = pd.read_csv(data_path / "nst-est2020-alldata.csv")
215203

216204
return _melt_df(df, list(range(2010, 2020)))
217205

0 commit comments

Comments (0)