
Commit e006b1c

Add run_timestamp column to dataset

Why these changes are being introduced:

During work on yielding only "current" records from the dataset, where the ordering of ETL runs in the dataset is critical, it was determined that more time granularity was needed for each ETL run. Currently we store the YYYY-MM-DD date for each run, but if multiple runs occur on the same day, we are unable to order them more granularly.

How this addresses that need:

* Adds a new run_timestamp column to the parquet dataset schema
* The timestamp is minted before any rows are written, and then used for every row in the ETL run

Side effects of this change:

* All TIMDEX components that use this library for reading and writing will need a terraform rebuild to pick up this change. Otherwise, they need no further modification.

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/TIMX-496

1 parent 8da7a7d commit e006b1c
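
To make the ordering problem concrete, here is a small illustrative sketch (stand-in values only, not code from this library) of why a date-valued run_date cannot order same-day runs, while a minted timestamp can:

from datetime import UTC, datetime

# two ETL runs on the same day: run_date alone cannot order them
runs = [
    {"run_id": "run-1", "run_date": "2025-01-01"},
    {"run_id": "run-2", "run_date": "2025-01-01"},
]
assert runs[0]["run_date"] == runs[1]["run_date"]  # tie; order is ambiguous

# with a per-run timestamp minted at write time, ordering is unambiguous
runs[0]["run_timestamp"] = datetime(2025, 1, 1, 9, 0, tzinfo=UTC)
runs[1]["run_timestamp"] = datetime(2025, 1, 1, 17, 30, tzinfo=UTC)
latest = max(runs, key=lambda run: run["run_timestamp"])
assert latest["run_id"] == "run-2"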

File tree

6 files changed: +146 −14 lines changed


Pipfile

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ pytest = "*"
 ruff = "*"
 setuptools = "*"
 pip-audit = "*"
+pytest-freezegun = "*"
 
 [requires]
 python_version = "3.12"
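
pytest-freezegun supplies the freeze_time marker used by the new tests below; a minimal sketch of the plugin's behavior (hypothetical test, assuming the plugin is installed):

from datetime import UTC, datetime

import pytest


@pytest.mark.freeze_time("2025-05-22 01:23:45.567890")
def test_now_is_pinned():
    # while frozen, datetime.now() returns the pinned instant
    assert datetime.now(UTC) == datetime(2025, 5, 22, 1, 23, 45, 567890, tzinfo=UTC)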

Pipfile.lock

Lines changed: 17 additions & 1 deletion
Some generated files are not rendered by default.

tests/conftest.py

Lines changed: 54 additions & 0 deletions
@@ -153,3 +153,57 @@ def dataset_with_runs_location(tmp_path) -> str:
 @pytest.fixture
 def local_dataset_with_runs(dataset_with_runs_location) -> TIMDEXDataset:
     return TIMDEXDataset(dataset_with_runs_location)
+
+
+@pytest.fixture
+def dataset_with_same_day_runs(tmp_path) -> TIMDEXDataset:
+    """Dataset fixture where a single source had multiple runs on the same day.
+
+    After these runs, we'd expect 70 records in OpenSearch:
+        - most recent full run "run-2" established a 75 record baseline
+        - runs "run-3" and "run-4" only modified records; no record count change
+        - run "run-5" deleted 5 records
+
+    If the order of full runs 1 & 2 is not handled correctly, we'd see an incorrect
+    baseline of 100 records.
+
+    If the order of daily runs 4 & 5 is not handled correctly, we'd see 75 records
+    because the deletes would be applied first and the subsequent index run would
+    recreate the records.
+    """
+    location = str(tmp_path / "dataset_with_same_day_runs")
+    os.mkdir(location)
+
+    timdex_dataset = TIMDEXDataset(location)
+
+    run_params = []
+
+    # Simulate two "full" runs where "run-2" should establish the baseline.
+    # Simulate daily runs, sometimes multiple per day, where deletes from
+    # "run-5" should be represented.
+    run_params.extend(
+        [
+            (100, "alma", "2025-01-01", "full", "index", "run-1"),
+            (75, "alma", "2025-01-01", "full", "index", "run-2"),
+            (10, "alma", "2025-01-01", "daily", "index", "run-3"),
+            (20, "alma", "2025-01-02", "daily", "index", "run-4"),
+            (5, "alma", "2025-01-02", "daily", "delete", "run-5"),
+        ]
+    )
+
+    for params in run_params:
+        num_records, source, run_date, run_type, action, run_id = params
+        records = generate_sample_records(
+            num_records,
+            timdex_record_id_prefix=source,
+            source=source,
+            run_date=run_date,
+            run_type=run_type,
+            action=action,
+            run_id=run_id,
+        )
+        timdex_dataset.write(records)
+
+    # reload after writes
+    timdex_dataset.load()
+
+    return timdex_dataset
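
For reference, the record arithmetic this fixture encodes (a quick check of the docstring's expected counts, not code from the diff):

# run-2 (most recent full run) establishes the baseline of 75 records;
# run-3 and run-4 only re-index existing records (no count change);
# run-5 deletes 5 records.
baseline, deleted = 75, 5
assert baseline - deleted == 70  # expected records in OpenSearch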

tests/test_dataset.py

Lines changed: 49 additions & 2 deletions
@@ -1,7 +1,7 @@
-# ruff: noqa: D205, S105, S106, SLF001, PD901, PLR2004
+# ruff: noqa: D205, D209, S105, S106, SLF001, PD901, PLR2004
 
 import os
-from datetime import date
+from datetime import UTC, date, datetime
 from unittest.mock import MagicMock, patch
 
 import pyarrow as pa
@@ -463,3 +463,50 @@ def test_dataset_current_records_index_filtering_accurate_records_yielded(
         "alma:23",
         "alma:24",
     ]
+
+
+@pytest.mark.freeze_time("2025-05-22 01:23:45.567890")
+def test_dataset_write_includes_minted_run_timestamp(
+    dataset_with_same_day_runs,
+):
+    # assert TIMDEXDataset.write() applies the current time as run_timestamp
+    row_dict = next(dataset_with_same_day_runs.read_dicts_iter())
+    assert "run_timestamp" in row_dict
+    assert row_dict["run_timestamp"] == datetime(
+        2025,
+        5,
+        22,
+        1,
+        23,
+        45,
+        567890,
+        tzinfo=UTC,
+    )
+
+    # assert the same timestamp is used for the entire batch
+    df = dataset_with_same_day_runs.read_dataframe()
+    assert len(list(df.run_timestamp.unique())) == 1
+
+
+def test_dataset_load_current_records_gets_correct_same_day_full_run(
+    dataset_with_same_day_runs,
+):
+    """Two full runs were performed on the same day, but 'run-2' was performed most
+    recently. current_records=True should discover the more recent of the two,
+    'run-2', not 'run-1'."""
+    dataset_with_same_day_runs.load(current_records=True, run_type="full")
+    df = dataset_with_same_day_runs.read_dataframe()
+
+    assert list(df.run_id.unique()) == ["run-2"]
+
+
+def test_dataset_load_current_records_gets_correct_same_day_daily_runs_ordering(
+    dataset_with_same_day_runs,
+):
+    """Two runs were performed on 2025-01-02, but the most recent records should be
+    from run 'run-5', which are action='delete', not from 'run-4' with
+    action='index'."""
+    dataset_with_same_day_runs.load(current_records=True, run_type="daily")
+    first_record = next(dataset_with_same_day_runs.read_dicts_iter())
+
+    assert first_record["run_id"] == "run-5"
+    assert first_record["action"] == "delete"

timdex_dataset_api/dataset.py

Lines changed: 20 additions & 9 deletions
@@ -41,6 +41,7 @@
         pa.field("year", pa.string()),
         pa.field("month", pa.string()),
         pa.field("day", pa.string()),
+        pa.field("run_timestamp", pa.timestamp("us", tz="UTC")),
     )
 )
@@ -62,6 +63,7 @@ class DatasetFilters(TypedDict, total=False):
     year: str | None
     month: str | None
     day: str | None
+    run_timestamp: str | datetime | None
 
 
 @dataclass
@@ -112,15 +114,19 @@ def __init__(
             location (str | list[str]): Local filesystem path or an S3 URI to
                 a parquet dataset. For partitioned datasets, set to the base directory.
         """
-        self.location = location
         self.config = config or TIMDEXDatasetConfig()
+        self.location = location
 
+        # pyarrow dataset
         self.filesystem, self.paths = self.parse_location(self.location)
        self.dataset: ds.Dataset = None  # type: ignore[assignment]
         self.schema = TIMDEX_DATASET_SCHEMA
         self.partition_columns = TIMDEX_DATASET_PARTITION_COLUMNS
+
+        # writing
         self._written_files: list[ds.WrittenFile] = None  # type: ignore[assignment]
 
+        # reading
         self._current_records: bool = False
         self._current_records_dataset: ds.Dataset = None  # type: ignore[assignment]
@@ -405,26 +411,31 @@ def write(
         return self._written_files  # type: ignore[return-value]
 
     def create_record_batches(
-        self,
-        records_iter: Iterator["DatasetRecord"],
+        self, records_iter: Iterator["DatasetRecord"]
     ) -> Iterator[pa.RecordBatch]:
         """Yield pyarrow.RecordBatches for writing.
 
         This method expects an iterator of DatasetRecord instances.
 
-        Each DatasetRecord is validated and serialized to a dictionary before added to a
-        pyarrow.RecordBatch for writing.
+        Each DatasetRecord is serialized to a dictionary, any column data shared by all
+        rows is added to the record, and then added to a pyarrow.RecordBatch for writing.
 
         Args:
             - records_iter: Iterator of DatasetRecord instances
         """
+        run_timestamp = datetime.now(UTC)
         for i, record_batch in enumerate(
             itertools.batched(records_iter, self.config.write_batch_size)
         ):
-            batch = pa.RecordBatch.from_pylist(
-                [record.to_dict() for record in record_batch]
-            )
-            logger.debug(f"Yielding batch {i+1} for dataset writing.")
+            record_dicts = [
+                {
+                    **record.to_dict(),
+                    "run_timestamp": run_timestamp,
+                }
+                for record in record_batch
+            ]
+            batch = pa.RecordBatch.from_pylist(record_dicts)
+            logger.debug(f"Yielding batch {i + 1} for dataset writing.")
             yield batch
 
     def log_write_statistics(self, start_time: float) -> None:
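
The important detail above is that run_timestamp is minted once, before the batch loop, so every row in a single ETL run carries an identical value even across many batches. A standalone sketch of that pattern (illustrative only, using stand-in row dicts and Python 3.12's itertools.batched):

import itertools
from datetime import UTC, datetime


def stamp_batches(rows, batch_size=2):
    # minted once per call: every row in this "run" shares the same timestamp
    run_timestamp = datetime.now(UTC)
    for batch in itertools.batched(rows, batch_size):
        yield [{**row, "run_timestamp": run_timestamp} for row in batch]


rows = [{"id": i} for i in range(5)]
stamped = [row for batch in stamp_batches(rows) for row in batch]
assert len({row["run_timestamp"] for row in stamped}) == 1  # one shared timestamp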

timdex_dataset_api/run.py

Lines changed: 5 additions & 2 deletions
@@ -52,6 +52,7 @@ def get_runs_metadata(self, *, refresh: bool = False) -> pd.DataFrame:
                 "source": "first",
                 "run_date": "first",
                 "run_type": "first",
+                "run_timestamp": "first",
                 "num_rows": "sum",
                 "filename": list,
             }
@@ -65,9 +66,9 @@ def get_runs_metadata(self, *, refresh: bool = False) -> pd.DataFrame:
             lambda x: len(x)
         )
 
-        # sort by run date and source
+        # sort by run_timestamp (more granularity than run_date) and source
         grouped_runs_df = grouped_runs_df.sort_values(
-            ["run_date", "source"], ascending=False
+            ["run_timestamp", "source"], ascending=False
         )
 
         # cache the result
@@ -185,12 +186,14 @@ def _parse_run_metadata_from_parquet_file(self, parquet_filepath: str) -> dict:
         run_date = columns_meta[4]["statistics"]["max"]
         run_type = columns_meta[5]["statistics"]["max"]
         run_id = columns_meta[7]["statistics"]["max"]
+        run_timestamp = columns_meta[9]["statistics"]["max"]
 
         return {
             "source": source,
             "run_date": run_date,
             "run_type": run_type,
             "run_id": run_id,
+            "run_timestamp": run_timestamp,
             "num_rows": num_rows,
             "filename": parquet_filepath,
         }
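
The new run_timestamp metadata is read from parquet footer statistics at column index 9, mirroring the existing index-based lookups. A rough sketch of inspecting such statistics with pyarrow (hypothetical file path; the index depends on the schema's column order):

import pyarrow.parquet as pq

parquet_file = pq.ParquetFile("dataset/example-run.parquet")  # hypothetical path
row_group = parquet_file.metadata.row_group(0)

# column 9 corresponds to run_timestamp in this schema's column order
statistics = row_group.column(9).statistics
print(statistics.max)  # max run_timestamp within this row group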
