|
47 | 47 | import org.apache.iceberg.MetricsConfig; |
48 | 48 | import org.apache.iceberg.Schema; |
49 | 49 | import org.apache.iceberg.avro.AvroSchemaUtil; |
| 50 | +import org.apache.iceberg.data.Record; |
| 51 | +import org.apache.iceberg.data.parquet.GenericParquetReaders; |
| 52 | +import org.apache.iceberg.io.CloseableIterable; |
50 | 53 | import org.apache.iceberg.io.InputFile; |
51 | 54 | import org.apache.iceberg.mapping.MappingUtil; |
52 | 55 | import org.apache.iceberg.mapping.NameMapping; |
@@ -353,4 +356,144 @@ private Pair<File, Long> generateFile( |
353 | 356 | records.toArray(new GenericData.Record[] {})); |
354 | 357 | return Pair.of(file, size); |
355 | 358 | } |
| 359 | + |
| 360 | + @Test |
| 361 | + public void testReadNestedStructWithoutId() throws IOException { |
| 362 | + Schema icebergSchema = |
| 363 | + new Schema( |
| 364 | + Types.NestedField.required( |
| 365 | + 1, |
| 366 | + "outer_struct", |
| 367 | + Types.StructType.of( |
| 368 | + Types.NestedField.optional( |
| 369 | + 2, |
| 370 | + "middle_struct", |
| 371 | + Types.StructType.of( |
| 372 | + Types.NestedField.optional( |
| 373 | + 3, |
| 374 | + "inner_struct", |
| 375 | + Types.StructType.of( |
| 376 | + Types.NestedField.optional( |
| 377 | + 4, "value_field", Types.StringType.get())))))))); |
| 378 | + |
| 379 | + // Create Avro schema without IDs. |
| 380 | + org.apache.avro.Schema avroSchema = createAvroSchemaWithoutIds(); |
| 381 | + |
| 382 | + // Write test data to Parquet file. |
| 383 | + File file = createTempFile(temp); |
| 384 | + writeParquetFile(file, avroSchema); |
| 385 | + |
| 386 | + // Read and verify the data. |
| 387 | + try (CloseableIterable<Record> reader = |
| 388 | + Parquet.read(Files.localInput(file)) |
| 389 | + .project(icebergSchema) |
| 390 | + .createReaderFunc( |
| 391 | + fileSchema -> GenericParquetReaders.buildReader(icebergSchema, fileSchema)) |
| 392 | + .build()) { |
| 393 | + |
| 394 | + org.apache.iceberg.data.Record readRecord = Iterables.getOnlyElement(reader); |
| 395 | + verifyNestedStructData(readRecord); |
| 396 | + } |
| 397 | + } |
| 398 | + |
| 399 | + private org.apache.avro.Schema createAvroSchemaWithoutIds() { |
| 400 | + org.apache.avro.Schema innerStructSchema = |
| 401 | + org.apache.avro.Schema.createRecord("inner_struct_type", null, null, false); |
| 402 | + innerStructSchema.setFields( |
| 403 | + List.of( |
| 404 | + new org.apache.avro.Schema.Field( |
| 405 | + "value_field", |
| 406 | + org.apache.avro.Schema.createUnion( |
| 407 | + List.of( |
| 408 | + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.NULL), |
| 409 | + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING))), |
| 410 | + null, |
| 411 | + org.apache.avro.JsonProperties.NULL_VALUE))); |
| 412 | + |
| 413 | + org.apache.avro.Schema middleStructSchema = |
| 414 | + org.apache.avro.Schema.createRecord("middle_struct_type", null, null, false); |
| 415 | + middleStructSchema.setFields( |
| 416 | + List.of( |
| 417 | + new org.apache.avro.Schema.Field( |
| 418 | + "inner_struct", |
| 419 | + org.apache.avro.Schema.createUnion( |
| 420 | + List.of( |
| 421 | + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.NULL), |
| 422 | + innerStructSchema)), |
| 423 | + null, |
| 424 | + org.apache.avro.JsonProperties.NULL_VALUE))); |
| 425 | + |
| 426 | + org.apache.avro.Schema outerStructSchema = |
| 427 | + org.apache.avro.Schema.createRecord("outer_struct_type", null, null, false); |
| 428 | + outerStructSchema.setFields( |
| 429 | + List.of( |
| 430 | + new org.apache.avro.Schema.Field( |
| 431 | + "middle_struct", |
| 432 | + org.apache.avro.Schema.createUnion( |
| 433 | + List.of( |
| 434 | + org.apache.avro.Schema.create(org.apache.avro.Schema.Type.NULL), |
| 435 | + middleStructSchema)), |
| 436 | + null, |
| 437 | + org.apache.avro.JsonProperties.NULL_VALUE))); |
| 438 | + |
| 439 | + org.apache.avro.Schema recordSchema = |
| 440 | + org.apache.avro.Schema.createRecord("test_record", null, null, false); |
| 441 | + recordSchema.setFields( |
| 442 | + List.of(new org.apache.avro.Schema.Field("outer_struct", outerStructSchema, null, null))); |
| 443 | + |
| 444 | + return recordSchema; |
| 445 | + } |
| 446 | + |
| 447 | + private void writeParquetFile(File file, org.apache.avro.Schema avroSchema) throws IOException { |
| 448 | + // Create test data. |
| 449 | + GenericData.Record record = createNestedRecord(avroSchema); |
| 450 | + |
| 451 | + // Write to Parquet file. |
| 452 | + try (ParquetWriter<GenericRecord> writer = |
| 453 | + AvroParquetWriter.<GenericRecord>builder(new org.apache.hadoop.fs.Path(file.toURI())) |
| 454 | + .withSchema(avroSchema) |
| 455 | + .withDataModel(GenericData.get()) |
| 456 | + .build()) { |
| 457 | + writer.write(record); |
| 458 | + } |
| 459 | + } |
| 460 | + |
| 461 | + private GenericData.Record createNestedRecord(org.apache.avro.Schema avroSchema) { |
| 462 | + org.apache.avro.Schema outerSchema = avroSchema.getField("outer_struct").schema(); |
| 463 | + org.apache.avro.Schema middleSchema = |
| 464 | + outerSchema.getField("middle_struct").schema().getTypes().get(1); |
| 465 | + org.apache.avro.Schema innerSchema = |
| 466 | + middleSchema.getField("inner_struct").schema().getTypes().get(1); |
| 467 | + |
| 468 | + GenericRecordBuilder innerBuilder = new GenericRecordBuilder(innerSchema); |
| 469 | + innerBuilder.set("value_field", "test_value"); |
| 470 | + |
| 471 | + GenericRecordBuilder middleBuilder = new GenericRecordBuilder(middleSchema); |
| 472 | + middleBuilder.set("inner_struct", innerBuilder.build()); |
| 473 | + |
| 474 | + GenericRecordBuilder outerBuilder = new GenericRecordBuilder(outerSchema); |
| 475 | + outerBuilder.set("middle_struct", middleBuilder.build()); |
| 476 | + |
| 477 | + GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema); |
| 478 | + recordBuilder.set("outer_struct", outerBuilder.build()); |
| 479 | + |
| 480 | + return recordBuilder.build(); |
| 481 | + } |
| 482 | + |
| 483 | + private void verifyNestedStructData(org.apache.iceberg.data.Record record) { |
| 484 | + org.apache.iceberg.data.Record outerStruct = (org.apache.iceberg.data.Record) record.get(0); |
| 485 | + assertThat(outerStruct).isNotNull().withFailMessage("Outer struct should not be null"); |
| 486 | + |
| 487 | + org.apache.iceberg.data.Record middleStruct = |
| 488 | + (org.apache.iceberg.data.Record) outerStruct.get(0); |
| 489 | + assertThat(middleStruct).isNotNull().withFailMessage("Middle struct should not be null"); |
| 490 | + |
| 491 | + org.apache.iceberg.data.Record innerStruct = |
| 492 | + (org.apache.iceberg.data.Record) middleStruct.get(0); |
| 493 | + assertThat(innerStruct).isNotNull().withFailMessage("Inner struct should not be null"); |
| 494 | + |
| 495 | + assertThat(innerStruct.get(0).toString()) |
| 496 | + .isEqualTo("test_value") |
| 497 | + .withFailMessage("Inner value field should match expected value"); |
| 498 | + } |
356 | 499 | } |
0 commit comments