From bbd4a15921e5518993dce57b4ea24c5037db8726 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Mon, 7 Jul 2025 20:44:45 +0530 Subject: [PATCH 1/5] Remove cached properties before updating table scans --- pyiceberg/table/__init__.py | 15 ++++++++++++++- tests/integration/test_reads.py | 13 +++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 1dc7a29cc1..49a1255bf7 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1619,6 +1619,13 @@ def _parse_row_filter(expr: Union[str, BooleanExpression]) -> BooleanExpression: return parser.parse(expr) if isinstance(expr, str) else expr +def _get_cached_property_names(cls: Any) -> Set[str]: + """Return a set of all cached property names defined on the class.""" + from inspect import getmembers + + return {name for name, attr in getmembers(cls) if isinstance(attr, cached_property)} + + S = TypeVar("S", bound="TableScan", covariant=True) @@ -1691,7 +1698,13 @@ def to_polars(self) -> pl.DataFrame: ... def update(self: S, **overrides: Any) -> S: """Create a copy of this table scan with updated fields.""" - return type(self)(**{**self.__dict__, **overrides}) + data = {**self.__dict__, **overrides} + + # Cached properties are also stored in the __dict__, so must be removed + for cached_prop in _get_cached_property_names(self.__class__): + data.pop(cached_prop, None) + + return type(self)(**data) def use_ref(self: S, name: str) -> S: if self.snapshot_id: diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index b417a43616..3aac579011 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -1024,3 +1024,16 @@ def test_scan_with_datetime(catalog: Catalog) -> None: df = table.scan(row_filter=LessThan("datetime", yesterday)).to_pandas() assert len(df) == 0 + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_filter_after_arrow_scan(catalog: Catalog) -> None: + identifier = "test_partitioned_by_hours" + table = catalog.load_table(f"default.{identifier}") + + scan = table.scan() + assert len(scan.to_arrow()) > 0 + + scan = scan.filter("ts >= '2023-03-05T00:00:00+00:00'") + assert len(scan.to_arrow()) > 0 From bdb42696f0fc4d0815856f16c38bcd6b27b2ceaf Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Thu, 10 Jul 2025 19:04:25 +0530 Subject: [PATCH 2/5] try construct method --- pyiceberg/table/__init__.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 49a1255bf7..7c5de92eb4 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1619,13 +1619,6 @@ def _parse_row_filter(expr: Union[str, BooleanExpression]) -> BooleanExpression: return parser.parse(expr) if isinstance(expr, str) else expr -def _get_cached_property_names(cls: Any) -> Set[str]: - """Return a set of all cached property names defined on the class.""" - from inspect import getmembers - - return {name for name, attr in getmembers(cls) if isinstance(attr, cached_property)} - - S = TypeVar("S", bound="TableScan", covariant=True) @@ -1696,15 +1689,12 @@ def to_pandas(self, **kwargs: Any) -> pd.DataFrame: ... @abstractmethod def to_polars(self) -> pl.DataFrame: ... + @abstractmethod def update(self: S, **overrides: Any) -> S: """Create a copy of this table scan with updated fields.""" - data = {**self.__dict__, **overrides} - - # Cached properties are also stored in the __dict__, so must be removed - for cached_prop in _get_cached_property_names(self.__class__): - data.pop(cached_prop, None) - return type(self)(**data) + def _construct(self: S, **kwargs: Any) -> S: + return type(self)(**kwargs) def use_ref(self: S, name: str) -> S: if self.snapshot_id: @@ -1814,6 +1804,19 @@ def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_ent class DataScan(TableScan): + def update(self: DataScan, **overrides: Any) -> DataScan: + return self._construct( + table_metadata=self.table_metadata, + io=self.io, + row_filter=self.row_filter, + selected_fields=self.selected_fields, + case_sensitive=self.case_sensitive, + snapshot_id=self.snapshot_id, + options=self.options, + limit=self.limit, + **overrides, + ) + def _build_partition_projection(self, spec_id: int) -> BooleanExpression: project = inclusive_projection(self.table_metadata.schema(), self.table_metadata.specs()[spec_id], self.case_sensitive) return project(self.row_filter) From 30db4d11ccc685c4992470f733a6662bfb2b8437 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Thu, 10 Jul 2025 19:10:13 +0530 Subject: [PATCH 3/5] Introduce arguments in subclass --- pyiceberg/table/__init__.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 7c5de92eb4..b4476b70e6 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1689,12 +1689,13 @@ def to_pandas(self, **kwargs: Any) -> pd.DataFrame: ... @abstractmethod def to_polars(self) -> pl.DataFrame: ... + @property @abstractmethod + def _arguments(self) -> dict[str, Any]: ... + def update(self: S, **overrides: Any) -> S: """Create a copy of this table scan with updated fields.""" - - def _construct(self: S, **kwargs: Any) -> S: - return type(self)(**kwargs) + return type(self)(**{**self._arguments, **overrides}) def use_ref(self: S, name: str) -> S: if self.snapshot_id: @@ -1804,18 +1805,18 @@ def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_ent class DataScan(TableScan): - def update(self: DataScan, **overrides: Any) -> DataScan: - return self._construct( - table_metadata=self.table_metadata, - io=self.io, - row_filter=self.row_filter, - selected_fields=self.selected_fields, - case_sensitive=self.case_sensitive, - snapshot_id=self.snapshot_id, - options=self.options, - limit=self.limit, - **overrides, - ) + @property + def _arguments(self) -> dict[str, Any]: + return { + "table_metadata": self.table_metadata, + "io": self.io, + "row_filter": self.row_filter, + "selected_fields": self.selected_fields, + "case_sensitive": self.case_sensitive, + "snapshot_id": self.snapshot_id, + "options": self.options, + "limit": self.limit, + } def _build_partition_projection(self, spec_id: int) -> BooleanExpression: project = inclusive_projection(self.table_metadata.schema(), self.table_metadata.specs()[spec_id], self.case_sensitive) From c4b793e8bb9a866189d68d06744afa55ab7b6f69 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Fri, 11 Jul 2025 00:21:59 +0530 Subject: [PATCH 4/5] Use method that subclasses override --- pyiceberg/table/__init__.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index b4476b70e6..f36ddbce1c 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1689,14 +1689,24 @@ def to_pandas(self, **kwargs: Any) -> pd.DataFrame: ... @abstractmethod def to_polars(self) -> pl.DataFrame: ... - @property - @abstractmethod - def _arguments(self) -> dict[str, Any]: ... - def update(self: S, **overrides: Any) -> S: """Create a copy of this table scan with updated fields.""" return type(self)(**{**self._arguments, **overrides}) + @property + def _arguments(self) -> dict[str, Any]: + """Return the arguments for TableScan creation. Subclasses should override this to include their constructor arguments.""" + return { + "table_metadata": self.table_metadata, + "io": self.io, + "row_filter": self.row_filter, + "selected_fields": self.selected_fields, + "case_sensitive": self.case_sensitive, + "snapshot_id": self.snapshot_id, + "options": self.options, + "limit": self.limit, + } + def use_ref(self: S, name: str) -> S: if self.snapshot_id: raise ValueError(f"Cannot override ref, already set snapshot id={self.snapshot_id}") From bb0521c7c7c08c26cb7a62be7786b6801a210512 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Fri, 11 Jul 2025 00:23:37 +0530 Subject: [PATCH 5/5] Fix --- pyiceberg/table/__init__.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index f36ddbce1c..837a6cd51d 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1695,7 +1695,7 @@ def update(self: S, **overrides: Any) -> S: @property def _arguments(self) -> dict[str, Any]: - """Return the arguments for TableScan creation. Subclasses should override this to include their constructor arguments.""" + """Return the arguments for TableScan creation. Subclasses with additional constructor arguments should override this to include them.""" return { "table_metadata": self.table_metadata, "io": self.io, @@ -1815,19 +1815,6 @@ def _match_deletes_to_data_file(data_entry: ManifestEntry, positional_delete_ent class DataScan(TableScan): - @property - def _arguments(self) -> dict[str, Any]: - return { - "table_metadata": self.table_metadata, - "io": self.io, - "row_filter": self.row_filter, - "selected_fields": self.selected_fields, - "case_sensitive": self.case_sensitive, - "snapshot_id": self.snapshot_id, - "options": self.options, - "limit": self.limit, - } - def _build_partition_projection(self, spec_id: int) -> BooleanExpression: project = inclusive_projection(self.table_metadata.schema(), self.table_metadata.specs()[spec_id], self.case_sensitive) return project(self.row_filter)