From 3a75c25ca12dc5eac6169fb429aa2313d121588d Mon Sep 17 00:00:00 2001 From: Bogdan Romenskii Date: Thu, 30 Oct 2025 23:07:16 +0100 Subject: [PATCH 1/7] Check that source is not a directory or a list in read_table --- python/pyarrow/parquet/core.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 24cb586c82b..de5b7ac34d1 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1887,10 +1887,23 @@ def read_table(source, *, columns=None, use_threads=True, "the 'schema' argument is not supported when the " "pyarrow.dataset module is not available" ) + if isinstance(source, list): + raise ValueError( + "the 'source' argument cannot be a list of files " + "when the pyarrow.dataset is not available" + ) + filesystem, path = _resolve_filesystem_and_path(source, filesystem) if filesystem is not None: - source = filesystem.open_input_file(path) - # TODO test that source is not a directory or a list + try: + source = filesystem.open_input_file(path) + except (OSError, FileNotFoundError) as e: + raise ValueError( + "the 'source' argument should be " + "an existing .parquet file and not a directory, " + "when the pyarrow.dataset is not available" + ) from e + dataset = ParquetFile( source, read_dictionary=read_dictionary, binary_type=binary_type, From c420c0f51a5c6ac37a1199de6f720f9daa402d8a Mon Sep 17 00:00:00 2001 From: Bogdan Romenskii Date: Thu, 30 Oct 2025 23:51:51 +0100 Subject: [PATCH 2/7] Add test for read_table if pyarrow.dataset is unavailable --- python/pyarrow/tests/parquet/test_basic.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 591bcffc1ac..735a4bd636e 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -16,6 +16,7 @@ # under the License. import os +import sys from collections import OrderedDict import io import warnings @@ -993,3 +994,14 @@ def test_checksum_write_to_dataset(tempdir): # checksum verification enabled raises an exception with pytest.raises(OSError, match="CRC checksum verification"): _ = pq.read_table(corrupted_file_path, page_checksum_verification=True) + + +@pytest.mark.parametrize( + "source", ["/tmp/", ["/tmp/file1.parquet", "/tmp/file2.parquet"]]) +def test_read_table_raises_value_error_when_ds_is_unavailable( + monkeypatch, source): + # GH-47728 + monkeypatch.setitem(sys.modules, "pyarrow.dataset", None) + + with pytest.raises(ValueError): + pq.read_table(source=source) From d301489ccd15908c93d0a742aaf9689aa529e67c Mon Sep 17 00:00:00 2001 From: Bogdan Romenskii Date: Fri, 31 Oct 2025 00:11:24 +0100 Subject: [PATCH 3/7] Fix linter issue --- python/pyarrow/tests/parquet/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 735a4bd636e..a88579fcc67 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -999,7 +999,7 @@ def test_checksum_write_to_dataset(tempdir): @pytest.mark.parametrize( "source", ["/tmp/", ["/tmp/file1.parquet", "/tmp/file2.parquet"]]) def test_read_table_raises_value_error_when_ds_is_unavailable( - monkeypatch, source): + monkeypatch, source): # GH-47728 monkeypatch.setitem(sys.modules, "pyarrow.dataset", None) From 4f219f381f60a9a0be8cc2ddd294208d6b9786a4 Mon Sep 17 00:00:00 2001 From: Bogdan Romenskii Date: Sun, 2 Nov 2025 13:14:22 +0100 Subject: [PATCH 4/7] Specify the match for pytest.raises --- python/pyarrow/tests/parquet/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index a88579fcc67..9bdbb50c041 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -1003,5 +1003,5 @@ def test_read_table_raises_value_error_when_ds_is_unavailable( # GH-47728 monkeypatch.setitem(sys.modules, "pyarrow.dataset", None) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="the 'source' argument"): pq.read_table(source=source) From cea6bb75a0440038143c98bc6044b4ebe8448d8c Mon Sep 17 00:00:00 2001 From: Bogdan Romenskii Date: Sun, 2 Nov 2025 13:15:44 +0100 Subject: [PATCH 5/7] Check if the source is a file directly --- python/pyarrow/parquet/core.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index de5b7ac34d1..5a2752eba61 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1890,19 +1890,19 @@ def read_table(source, *, columns=None, use_threads=True, if isinstance(source, list): raise ValueError( "the 'source' argument cannot be a list of files " - "when the pyarrow.dataset is not available" + "when the pyarrow.dataset module is not available" ) filesystem, path = _resolve_filesystem_and_path(source, filesystem) if filesystem is not None: - try: - source = filesystem.open_input_file(path) - except (OSError, FileNotFoundError) as e: + if not filesystem.get_file_info(path).is_file: raise ValueError( "the 'source' argument should be " "an existing .parquet file and not a directory, " - "when the pyarrow.dataset is not available" - ) from e + "when the pyarrow.dataset module is not available" + ) + + source = filesystem.open_input_file(path) dataset = ParquetFile( source, read_dictionary=read_dictionary, From c7c44f94e425835f3d5b7146ed7a64bc1ca54e36 Mon Sep 17 00:00:00 2001 From: Bogdan Romenskii Date: Tue, 4 Nov 2025 21:05:54 +0100 Subject: [PATCH 6/7] Update test_read_table_without_dataset test --- python/pyarrow/tests/parquet/test_basic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 9bdbb50c041..b588528db55 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -186,8 +186,7 @@ def __init__(self, *args, **kwargs): pq.read_table(path, partitioning=['week', 'color']) with pytest.raises(ValueError, match="the 'schema' argument"): pq.read_table(path, schema=table.schema) - # Error message varies depending on OS - with pytest.raises(OSError): + with pytest.raises(ValueError, match="the 'source' argument"): pq.read_table(tempdir) result = pq.read_table(path) assert result == table From d4017331cf4507de0f38cc63c86dd47d4f5c095d Mon Sep 17 00:00:00 2001 From: Bogdan Romenskii Date: Wed, 12 Nov 2025 22:09:48 +0100 Subject: [PATCH 7/7] Relint the files --- python/pyarrow/parquet/core.py | 2 +- python/pyarrow/tests/parquet/test_basic.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 5a2752eba61..5f62a3fc4f1 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1898,7 +1898,7 @@ def read_table(source, *, columns=None, use_threads=True, if not filesystem.get_file_info(path).is_file: raise ValueError( "the 'source' argument should be " - "an existing .parquet file and not a directory, " + "an existing parquet file and not a directory " "when the pyarrow.dataset module is not available" ) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index b588528db55..3b991fdd57a 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -997,8 +997,7 @@ def test_checksum_write_to_dataset(tempdir): @pytest.mark.parametrize( "source", ["/tmp/", ["/tmp/file1.parquet", "/tmp/file2.parquet"]]) -def test_read_table_raises_value_error_when_ds_is_unavailable( - monkeypatch, source): +def test_read_table_raises_value_error_when_ds_is_unavailable(monkeypatch, source): # GH-47728 monkeypatch.setitem(sys.modules, "pyarrow.dataset", None)