Skip to content

Commit 605dc05

Browse files
committed
Extend load to optionally limit to current parquet files
Why these changes are being introduced: With the creation of TIMDEXRunManager we now have the ability to identify the parquet files associated with current ETL runs, for all sources or for a single source. This can be used to limit a TIMDEXDataset on load to reading only those parquet files. How this addresses that need: * Updates TIMDEXDataset.load() with a new 'current_records' flag that, if True, uses TIMDEXRunManager to get a list of parquet files with which to update the dataset paths. Side effects of this change: * None without explicit use. Eventually, this could be utilized by contexts where only parquet files associated with current ETL runs are needed. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-494
1 parent bd3b937 commit 605dc05

File tree

4 files changed

+81
-18
lines changed

4 files changed

+81
-18
lines changed

tests/test_dataset.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
def test_dataset_init_success(location, expected_file_system, expected_source):
2525
timdex_dataset = TIMDEXDataset(location=location)
2626
assert isinstance(timdex_dataset.filesystem, expected_file_system)
27-
assert timdex_dataset.source == expected_source
27+
assert timdex_dataset.paths == expected_source
2828

2929

3030
def test_dataset_init_env_vars_set_config(monkeypatch, local_dataset_location):
@@ -79,8 +79,7 @@ def test_dataset_load_s3_sets_filesystem_and_dataset_success(
7979
timdex_dataset = TIMDEXDataset(location="s3://bucket/path/to/dataset")
8080
result = timdex_dataset.load()
8181

82-
mock_get_s3_fs.assert_called_once()
83-
mock_pyarrow_ds.assert_called_once_with(
82+
mock_pyarrow_ds.assert_called_with(
8483
"bucket/path/to/dataset",
8584
schema=timdex_dataset.schema,
8685
format="parquet",
@@ -137,6 +136,22 @@ def test_dataset_load_with_multi_nonpartition_filters_success(fixed_local_datase
137136
assert fixed_local_dataset.row_count == 1
138137

139138

139+
def test_dataset_load_current_records_all_sources_success(dataset_with_runs_location):
140+
timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
141+
timdex_dataset.load(current_records=True)
142+
143+
# 14 total parquet files, only 12 related to current runs
144+
assert len(timdex_dataset.dataset.files) == 12
145+
146+
147+
def test_dataset_load_current_records_one_source_success(dataset_with_runs_location):
148+
timdex_dataset = TIMDEXDataset(dataset_with_runs_location)
149+
timdex_dataset.load(current_records=True, source="alma")
150+
151+
# 7 total parquet files for source, only 6 related to current runs
152+
assert len(timdex_dataset.dataset.files) == 6
153+
154+
140155
def test_dataset_get_filtered_dataset_with_single_nonpartition_success(
141156
fixed_local_dataset,
142157
):

tests/test_runs.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,27 @@ def test_timdex_run_manager_get_runs_df(timdex_run_manager):
5656
assert runs_df.source.value_counts().to_dict() == {"alma": 7, "dspace": 7}
5757

5858

59+
def test_timdex_run_manager_get_all_current_run_parquet_files_success(
60+
timdex_run_manager,
61+
):
62+
ordered_parquet_files = timdex_run_manager.get_current_parquet_files()
63+
64+
# assert 12 parquet files, despite being 14 total for ALL sources
65+
# this represents the last full run and all daily since
66+
assert len(ordered_parquet_files) == 12
67+
68+
# assert sorted reverse chronologically
69+
assert "year=2025/month=01/day=01" in ordered_parquet_files[-1]
70+
71+
5972
def test_timdex_run_manager_get_source_current_run_parquet_files_success(
6073
timdex_run_manager,
6174
):
6275
ordered_parquet_files = timdex_run_manager.get_current_source_parquet_files("alma")
6376

64-
# assert 6 parquet files, despite being 8 total for alma
77+
# assert 6 parquet files, despite being 8 total for 'alma' source
6578
# this represents the last full run and all daily since
66-
assert len(ordered_parquet_files)
79+
assert len(ordered_parquet_files) == 6
6780

6881
# assert sorted reverse chronologically
6982
assert "year=2025/month=01/day=05" in ordered_parquet_files[0]

timdex_dataset_api/dataset.py

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
from timdex_dataset_api.config import configure_logger
2222
from timdex_dataset_api.exceptions import DatasetNotLoadedError
23+
from timdex_dataset_api.run import TIMDEXRunManager
2324

2425
if TYPE_CHECKING:
2526
from timdex_dataset_api.record import DatasetRecord # pragma: nocover
@@ -114,7 +115,7 @@ def __init__(
114115
self.location = location
115116
self.config = config or TIMDEXDatasetConfig()
116117

117-
self.filesystem, self.source = self.parse_location(self.location)
118+
self.filesystem, self.paths = self.parse_location(self.location)
118119
self.dataset: ds.Dataset = None # type: ignore[assignment]
119120
self.schema = TIMDEX_DATASET_SCHEMA
120121
self.partition_columns = TIMDEX_DATASET_PARTITION_COLUMNS
@@ -129,6 +130,8 @@ def row_count(self) -> int:
129130

130131
def load(
131132
self,
133+
*,
134+
current_records: bool = False,
132135
**filters: Unpack[DatasetFilters],
133136
) -> None:
134137
"""Lazy load a pyarrow.dataset.Dataset and set to self.dataset.
@@ -152,14 +155,23 @@ def load(
152155
"""
153156
start_time = time.perf_counter()
154157

155-
# load dataset
156-
self.dataset = ds.dataset(
157-
self.source,
158-
schema=self.schema,
159-
format="parquet",
160-
partitioning="hive",
161-
filesystem=self.filesystem,
162-
)
158+
# reset paths from original location before load
159+
_, self.paths = self.parse_location(self.location)
160+
161+
# perform initial load of full dataset
162+
self._load_pyarrow_dataset()
163+
164+
# if current_records flag set, limit to parquet files associated with current runs
165+
if current_records:
166+
timdex_run_manager = TIMDEXRunManager(timdex_dataset=self)
167+
168+
# if filters.source is set, further limit to only this source
169+
source = filters.get("source")
170+
if source:
171+
self.paths = timdex_run_manager.get_current_source_parquet_files(source)
172+
else:
173+
self.paths = timdex_run_manager.get_current_parquet_files()
174+
self._load_pyarrow_dataset()
163175

164176
# filter dataset
165177
self.dataset = self._get_filtered_dataset(**filters)
@@ -169,6 +181,16 @@ def load(
169181
f"{round(time.perf_counter()-start_time, 2)}s"
170182
)
171183

184+
def _load_pyarrow_dataset(self) -> None:
185+
"""Load the pyarrow dataset per local filesystem and paths attributes."""
186+
self.dataset = ds.dataset(
187+
self.paths,
188+
schema=self.schema,
189+
format="parquet",
190+
partitioning="hive",
191+
filesystem=self.filesystem,
192+
)
193+
172194
def _get_filtered_dataset(
173195
self,
174196
**filters: Unpack[DatasetFilters],
@@ -345,7 +367,8 @@ def write(
345367
start_time = time.perf_counter()
346368
self._written_files = []
347369

348-
if isinstance(self.source, list):
370+
dataset_filesystem, dataset_path = self.parse_location(self.location)
371+
if isinstance(dataset_path, list):
349372
raise TypeError(
350373
"Dataset location must be the root of a single dataset for writing"
351374
)
@@ -354,10 +377,10 @@ def write(
354377

355378
ds.write_dataset(
356379
record_batches_iter,
357-
base_dir=self.source,
380+
base_dir=dataset_path,
358381
basename_template="%s-{i}.parquet" % (str(uuid.uuid4())), # noqa: UP031
359382
existing_data_behavior="overwrite_or_ignore",
360-
filesystem=self.filesystem,
383+
filesystem=dataset_filesystem,
361384
file_visitor=lambda written_file: self._written_files.append(written_file), # type: ignore[arg-type]
362385
format="parquet",
363386
max_open_files=500,

timdex_dataset_api/run.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,17 @@ def get_current_source_parquet_files(self, source: str) -> list[str]:
115115

116116
return ordered_parquet_files
117117

118+
def get_current_parquet_files(self) -> list[str]:
119+
"""Get reverse chronological list of current parquet files for ALL sources."""
120+
runs_df = self.get_runs_metadata() # run metadata is cached for future calls
121+
sources = list(runs_df.source.unique())
122+
123+
source_parquet_files = []
124+
for source in sources:
125+
source_parquet_files.extend(self.get_current_source_parquet_files(source))
126+
127+
return source_parquet_files
128+
118129
def _get_parquet_files_run_metadata(self, max_workers: int = 250) -> pd.DataFrame:
119130
"""Retrieve run metadata from parquet file(s) in dataset.
120131
@@ -166,8 +177,9 @@ def _parse_run_metadata_from_parquet_file(self, parquet_filepath: str) -> dict:
166177
"""
167178
parquet_file = pq.ParquetFile(
168179
parquet_filepath,
169-
filesystem=self.timdex_dataset.filesystem, # type: ignore[union-attr]
180+
filesystem=self.timdex_dataset.filesystem,
170181
)
182+
171183
file_meta = parquet_file.metadata.to_dict()
172184
num_rows = file_meta["num_rows"]
173185
columns_meta = file_meta["row_groups"][0]["columns"] # type: ignore[typeddict-item]

0 commit comments

Comments
 (0)