Skip to content

Commit 46b033f

Browse files
authored
GH-47728: [Python] Check the source argument in parquet.read_table (#48008)
### Rationale for this change

See #47728. Check the `source` argument in `pyarrow.parquet.read_table` if `pyarrow.dataset` is not available.

### What changes are included in this PR?

Check the `source` argument and raise `ValueError` if it is either a list of `.parquet` files or a directory.

### Are these changes tested?

Yes

### Are there any user-facing changes?

No

If the `source` argument is a directory, I decided not to check it directly, but to catch the exceptions coming from `fs.open_input_file`, since it already checks for this, and to add an extra exception on top of the stack that explains the actual reason.

* GitHub Issue: #47728

Authored-by: Bogdan Romenskii <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
1 parent f5b3fc7 commit 46b033f

File tree

2 files changed

+26
-3
lines changed

2 files changed

+26
-3
lines changed

python/pyarrow/parquet/core.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1887,10 +1887,23 @@ def read_table(source, *, columns=None, use_threads=True,
18871887
"the 'schema' argument is not supported when the "
18881888
"pyarrow.dataset module is not available"
18891889
)
1890+
if isinstance(source, list):
1891+
raise ValueError(
1892+
"the 'source' argument cannot be a list of files "
1893+
"when the pyarrow.dataset module is not available"
1894+
)
1895+
18901896
filesystem, path = _resolve_filesystem_and_path(source, filesystem)
18911897
if filesystem is not None:
1898+
if not filesystem.get_file_info(path).is_file:
1899+
raise ValueError(
1900+
"the 'source' argument should be "
1901+
"an existing parquet file and not a directory "
1902+
"when the pyarrow.dataset module is not available"
1903+
)
1904+
18921905
source = filesystem.open_input_file(path)
1893-
# TODO test that source is not a directory or a list
1906+
18941907
dataset = ParquetFile(
18951908
source, read_dictionary=read_dictionary,
18961909
binary_type=binary_type,

python/pyarrow/tests/parquet/test_basic.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# under the License.
1717

1818
import os
19+
import sys
1920
from collections import OrderedDict
2021
import io
2122
import warnings
@@ -185,8 +186,7 @@ def __init__(self, *args, **kwargs):
185186
pq.read_table(path, partitioning=['week', 'color'])
186187
with pytest.raises(ValueError, match="the 'schema' argument"):
187188
pq.read_table(path, schema=table.schema)
188-
# Error message varies depending on OS
189-
with pytest.raises(OSError):
189+
with pytest.raises(ValueError, match="the 'source' argument"):
190190
pq.read_table(tempdir)
191191
result = pq.read_table(path)
192192
assert result == table
@@ -993,3 +993,13 @@ def test_checksum_write_to_dataset(tempdir):
993993
# checksum verification enabled raises an exception
994994
with pytest.raises(OSError, match="CRC checksum verification"):
995995
_ = pq.read_table(corrupted_file_path, page_checksum_verification=True)
996+
997+
998+
@pytest.mark.parametrize(
999+
"source", ["/tmp/", ["/tmp/file1.parquet", "/tmp/file2.parquet"]])
1000+
def test_read_table_raises_value_error_when_ds_is_unavailable(monkeypatch, source):
1001+
# GH-47728
1002+
monkeypatch.setitem(sys.modules, "pyarrow.dataset", None)
1003+
1004+
with pytest.raises(ValueError, match="the 'source' argument"):
1005+
pq.read_table(source=source)

0 commit comments

Comments
 (0)