|
84 | 84 | from pyiceberg.table import FileScanTask, TableProperties
|
85 | 85 | from pyiceberg.table.metadata import TableMetadataV2
|
86 | 86 | from pyiceberg.table.name_mapping import create_mapping_from_schema
|
87 |
| -from pyiceberg.transforms import IdentityTransform |
| 87 | +from pyiceberg.transforms import HourTransform, IdentityTransform |
88 | 88 | from pyiceberg.typedef import UTF8, Properties, Record
|
89 | 89 | from pyiceberg.types import (
|
90 | 90 | BinaryType,
|
@@ -2350,6 +2350,102 @@ def test_partition_for_demo() -> None:
|
2350 | 2350 | )
|
2351 | 2351 |
|
2352 | 2352 |
|
def test_partition_for_nested_field() -> None:
    """Partition by hour on a timestamp field nested one level inside a struct.

    The partition source (field id 3, ``bar.baz``) is not a top-level column,
    so this checks that ``_determine_partitions`` resolves the nested source
    field and applies ``HourTransform`` to it.
    """
    from datetime import datetime

    schema = Schema(
        NestedField(id=1, name="foo", field_type=StringType(), required=True),
        NestedField(
            id=2,
            name="bar",
            field_type=StructType(
                NestedField(id=3, name="baz", field_type=TimestampType(), required=False),
                NestedField(id=4, name="qux", field_type=IntegerType(), required=False),
            ),
            required=True,
        ),
    )

    spec = PartitionSpec(PartitionField(source_id=3, field_id=1000, transform=HourTransform(), name="ts"))

    # Two timestamps exactly one hour apart, so they fall into adjacent hour partitions.
    t1 = datetime(2025, 7, 11, 9, 30, 0)
    t2 = datetime(2025, 7, 11, 10, 30, 0)

    test_data = [
        {"foo": "a", "bar": {"baz": t1, "qux": 1}},
        {"foo": "b", "bar": {"baz": t2, "qux": 2}},
    ]

    arrow_table = pa.Table.from_pylist(test_data, schema=schema.as_arrow())
    partitions = _determine_partitions(spec, schema, arrow_table)

    # The set comparison below collapses duplicates, so pin the partition count
    # explicitly (as the sibling nested-field tests do).
    assert len(partitions) == 2  # 2 unique partitions
    partition_values = {p.partition_key.partition[0] for p in partitions}

    # Hours since the Unix epoch for 2025-07-11T09:00 and 2025-07-11T10:00.
    assert partition_values == {486729, 486730}
| 2384 | + |
| 2385 | + |
def test_partition_for_deep_nested_field() -> None:
    """Identity-partition on a string field nested two struct levels deep (``foo.bar.baz``).

    Three rows carry two distinct values, so exactly two partitions are expected.
    """
    # Build the schema from the innermost field outwards.
    leaf = NestedField(id=3, name="baz", field_type=StringType(), required=False)
    middle = NestedField(id=2, name="bar", field_type=StructType(leaf), required=True)
    schema = Schema(NestedField(id=1, name="foo", field_type=StructType(middle), required=True))

    spec = PartitionSpec(PartitionField(source_id=3, field_id=1000, transform=IdentityTransform(), name="qux"))

    # "data-1" appears twice on purpose: duplicates must share a partition.
    rows = [{"foo": {"bar": {"baz": value}}} for value in ("data-1", "data-2", "data-1")]

    arrow_table = pa.Table.from_pylist(rows, schema=schema.as_arrow())
    partitions = _determine_partitions(spec, schema, arrow_table)

    assert len(partitions) == 2  # 2 unique partitions
    observed = {p.partition_key.partition[0] for p in partitions}
    assert observed == {"data-1", "data-2"}
| 2417 | + |
| 2418 | + |
def test_inspect_partition_for_nested_field(catalog: InMemoryCatalog) -> None:
    """Append to a table partitioned on a nested field and read it back via ``inspect.partitions()``.

    The partition source is field id 3 (``bar.baz``); each of the two appended
    rows has a distinct value, so two partitions are expected in the metadata table.
    """
    bar_struct = StructType(
        NestedField(id=3, name="baz", field_type=StringType(), required=False),
        NestedField(id=4, name="qux", field_type=IntegerType(), required=False),
    )
    schema = Schema(
        NestedField(id=1, name="foo", field_type=StringType(), required=True),
        NestedField(id=2, name="bar", field_type=bar_struct, required=True),
    )
    partition_field = PartitionField(source_id=3, field_id=1000, transform=IdentityTransform(), name="part")
    spec = PartitionSpec(partition_field)

    catalog.create_namespace("default")
    table = catalog.create_table("default.test_partition_in_struct", schema=schema, partition_spec=spec)

    rows = [
        {"foo": "a", "bar": {"baz": "data-a", "qux": 1}},
        {"foo": "b", "bar": {"baz": "data-b", "qux": 2}},
    ]
    # Use the table's own schema for the arrow conversion, as returned by the catalog.
    table.append(pa.Table.from_pylist(rows, schema=table.schema().as_arrow()))

    partitions = table.inspect.partitions()["partition"].to_pylist()

    assert len(partitions) == 2
    assert {part["part"] for part in partitions} == {"data-a", "data-b"}
| 2447 | + |
| 2448 | + |
2353 | 2449 | def test_identity_partition_on_multi_columns() -> None:
|
2354 | 2450 | test_pa_schema = pa.schema([("born_year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())])
|
2355 | 2451 | test_schema = Schema(
|
|
0 commit comments