
Commit 277faa9

GH-48096 [Python][Parquet] Expose new WriterProperties::max_rows_per_page to Python bindings (#48101)
### Rationale for this change

See #48096. This exposes the `WriterProperties` `max_rows_per_page` option through Python's Parquet API.

### What changes are included in this PR?

The `max_rows_per_page` argument was added to the Python bindings.

### Are these changes tested?

Yes. Since the file metadata carries no information about the number of pages, a simple end-to-end test is used to verify that the implementation is correct.

### Are there any user-facing changes?

Yes: `max_rows_per_page` can now be set directly from PyArrow.

* GitHub Issue: #48096

Authored-by: Bogdan Romenskii <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
1 parent: f096d48
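A minimal usage sketch of the new keyword as exposed by this commit (the table and output path are illustrative, not taken from the change):

import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical data; any table works.
table = pa.table({"x": list(range(100_000))})

# Cap every data page at 5,000 rows; pq.write_table gains this
# keyword in this commit and forwards it to WriterProperties.
pq.write_table(table, "capped_pages.parquet", max_rows_per_page=5_000)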

File tree (6 files changed, +75 −0 lines):

  python/pyarrow/_dataset_parquet.pyx
  python/pyarrow/_parquet.pxd
  python/pyarrow/_parquet.pyx
  python/pyarrow/includes/libparquet.pxd
  python/pyarrow/parquet/core.py
  python/pyarrow/tests/parquet/test_parquet_writer.py

python/pyarrow/_dataset_parquet.pyx

Lines changed: 2 additions & 0 deletions
@@ -646,6 +646,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             version=self._properties["version"],
             write_statistics=self._properties["write_statistics"],
             data_page_size=self._properties["data_page_size"],
+            max_rows_per_page=self._properties["max_rows_per_page"],
             compression_level=self._properties["compression_level"],
             use_byte_stream_split=(
                 self._properties["use_byte_stream_split"]
@@ -695,6 +696,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             version="2.6",
             write_statistics=None,
             data_page_size=None,
+            max_rows_per_page=None,
             compression_level=None,
             use_byte_stream_split=False,
             column_encoding=None,

python/pyarrow/_parquet.pxd

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
         version=*,
         write_statistics=*,
         data_page_size=*,
+        max_rows_per_page=*,
         compression_level=*,
         use_byte_stream_split=*,
         column_encoding=*,

python/pyarrow/_parquet.pyx

Lines changed: 6 additions & 0 deletions
@@ -1984,6 +1984,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
         version=None,
         write_statistics=None,
         data_page_size=None,
+        max_rows_per_page=None,
         compression_level=None,
         use_byte_stream_split=False,
         column_encoding=None,
@@ -2129,6 +2130,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
     if data_page_size is not None:
         props.data_pagesize(data_page_size)

+    if max_rows_per_page is not None:
+        props.max_rows_per_page(max_rows_per_page)
+
     if write_batch_size is not None:
         props.write_batch_size(write_batch_size)

@@ -2300,6 +2304,7 @@ cdef class ParquetWriter(_Weakrefable):
                  use_deprecated_int96_timestamps=False,
                  coerce_timestamps=None,
                  data_page_size=None,
+                 max_rows_per_page=None,
                  allow_truncated_timestamps=False,
                  compression_level=None,
                  use_byte_stream_split=False,
@@ -2340,6 +2345,7 @@ cdef class ParquetWriter(_Weakrefable):
             version=version,
             write_statistics=write_statistics,
             data_page_size=data_page_size,
+            max_rows_per_page=max_rows_per_page,
             compression_level=compression_level,
             use_byte_stream_split=use_byte_stream_split,
             column_encoding=column_encoding,

python/pyarrow/includes/libparquet.pxd

Lines changed: 1 addition & 0 deletions
@@ -492,6 +492,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
         Builder* enable_store_decimal_as_integer()
         Builder* disable_store_decimal_as_integer()
         Builder* data_pagesize(int64_t size)
+        Builder* max_rows_per_page(int64_t max_rows)
         Builder* encoding(ParquetEncoding encoding)
         Builder* encoding(const c_string& path,
                           ParquetEncoding encoding)

python/pyarrow/parquet/core.py

Lines changed: 8 additions & 0 deletions
@@ -795,6 +795,10 @@ def _sanitize_table(table, new_schema, flavor):
     Set a target threshold for the approximate encoded size of data
     pages within a column chunk (in bytes). If None, use the default data page
     size of 1MByte.
+max_rows_per_page : int, default None
+    Maximum number of rows per page within a column chunk.
+    If None, use the default of 20000.
+    Smaller values reduce memory usage during reads but increase metadata overhead.
 flavor : {'spark'}, default None
     Sanitize schema or set other compatibility options to work with
     various target systems.
@@ -1042,6 +1046,7 @@ def __init__(self, where, schema, filesystem=None,
                  sorting_columns=None,
                  store_decimal_as_integer=False,
                  write_time_adjusted_to_utc=False,
+                 max_rows_per_page=None,
                  **options):
         if use_deprecated_int96_timestamps is None:
             # Use int96 timestamps for Spark
@@ -1096,6 +1101,7 @@ def __init__(self, where, schema, filesystem=None,
             sorting_columns=sorting_columns,
             store_decimal_as_integer=store_decimal_as_integer,
             write_time_adjusted_to_utc=write_time_adjusted_to_utc,
+            max_rows_per_page=max_rows_per_page,
             **options)
         self.is_open = True

@@ -1971,6 +1977,7 @@ def write_table(table, where, row_group_size=None, version='2.6',
                 sorting_columns=None,
                 store_decimal_as_integer=False,
                 write_time_adjusted_to_utc=False,
+                max_rows_per_page=None,
                 **kwargs):
     # Implementor's note: when adding keywords here / updating defaults, also
     # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions
@@ -2003,6 +2010,7 @@ def write_table(table, where, row_group_size=None, version='2.6',
             sorting_columns=sorting_columns,
             store_decimal_as_integer=store_decimal_as_integer,
             write_time_adjusted_to_utc=write_time_adjusted_to_utc,
+            max_rows_per_page=max_rows_per_page,
             **kwargs) as writer:
         writer.write_table(table, row_group_size=row_group_size)
     except Exception:
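The docstring above describes two per-page limits: `data_page_size` targets an encoded size in bytes while `max_rows_per_page` caps the row count. A sketch of setting both on `ParquetWriter`; the values are illustrative, and the assumption (not stated in this diff) is that a page closes when either limit is reached first:

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"x": pa.array(range(50_000))})

with pq.ParquetWriter(
    "both_limits.parquet",        # hypothetical output path
    schema=table.schema,
    data_page_size=64 * 1024,     # ~64 KiB encoded-size target per page
    max_rows_per_page=2_000,      # row cap per page, added by this commit
) as writer:
    writer.write_table(table)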

python/pyarrow/tests/parquet/test_parquet_writer.py

Lines changed: 57 additions & 0 deletions
@@ -487,3 +487,60 @@ def test_arrow_writer_props_time_adjusted_to_utc(
     result.validate(full=True)

     assert result.equals(table)
+
+
+@pytest.mark.parametrize(
+    "max_rows_per_page",
+    [1, 10, 100, 1_000, None],
+)
+def test_writer_props_max_rows_per_page(tempdir, max_rows_per_page):
+    # GH-48096
+    filename = tempdir / "max_rows_per_page.parquet"
+
+    table = pa.table({
+        "x": pa.array([1, 2, 3, 4, 5, 6, 7], type=pa.int8()),
+        "y": pa.array([11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0], type=pa.float16()),
+    })
+
+    schema = pa.schema([
+        ("x", pa.int8()),
+        ("y", pa.float16()),
+    ])
+
+    with pq.ParquetWriter(
+        where=filename,
+        schema=schema,
+        max_rows_per_page=max_rows_per_page,
+    ) as writer:
+        writer.write_table(table)
+
+    result = pq.read_table(filename, schema=schema)
+
+    result.validate(full=True)
+
+    assert result.equals(table)
+
+
+def test_writer_props_max_rows_per_page_file_size(tempdir):
+    # GH-48096
+    table = pa.table({
+        "x": pa.array(range(1_000_000))
+    })
+
+    local = fs.LocalFileSystem()
+    file_infos = []
+
+    for max_rows in (1_000, 10_000):
+        path = f"{tempdir}/max_rows_per_page_{max_rows}.parquet"
+
+        with pq.ParquetWriter(
+            where=path,
+            schema=table.schema,
+            max_rows_per_page=max_rows,
+        ) as writer:
+            writer.write_table(table)
+
+        file_infos.append(local.get_file_info(path))
+
+    # A smaller maximum rows parameter should produce a larger file
+    assert file_infos[0].size > file_infos[1].size
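As the commit message notes, the file metadata does not record page counts, so the second test falls back to comparing file sizes. The same check can be reproduced by hand; a small sketch (paths are illustrative):

import os

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"x": pa.array(range(1_000_000))})

sizes = {}
for max_rows in (1_000, 10_000):
    path = f"/tmp/pages_{max_rows}.parquet"  # hypothetical location
    pq.write_table(table, path, max_rows_per_page=max_rows)
    sizes[max_rows] = os.path.getsize(path)

# Each extra page adds a page header, so the tighter row cap
# should yield the larger file.
assert sizes[1_000] > sizes[10_000]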
