Skip to content

Commit 63d0045

Browse files
Ashok Vengala and Jennifer Baldwin
authored and committed
merge OTF-1902
1 parent 6718864 commit 63d0045

File tree

3 files changed

+155
-5
lines changed

3 files changed

+155
-5
lines changed

parquet/src/main/java/org/apache/iceberg/data/parquet/BaseParquetReaders.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,10 @@ public ParquetValueReader<?> struct(
234234
if (fieldReader != null) {
235235
Type fieldType = fields.get(i);
236236
int fieldD = type.getMaxDefinitionLevel(path(fieldType.getName())) - 1;
237-
int id = fieldType.getId().intValue();
237+
int id =
238+
fieldType.getId() != null
239+
? fieldType.getId().intValue()
240+
: i < expected.fields().size() ? expected.fields().get(i).fieldId() : -1;
238241
readersById.put(id, ParquetValueReaders.option(fieldType, fieldD, fieldReader));
239242
}
240243
}

parquet/src/main/java/org/apache/iceberg/parquet/TypeWithSchemaVisitor.java

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -199,11 +199,15 @@ private static <T> T visitField(
199199
private static <T> List<T> visitFields(
200200
Types.StructType struct, GroupType group, TypeWithSchemaVisitor<T> visitor) {
201201
List<T> results = Lists.newArrayListWithExpectedSize(group.getFieldCount());
202+
int fieldIdFromStruct = 0;
202203
for (Type field : group.getFields()) {
203-
int id = -1;
204-
if (field.getId() != null) {
205-
id = field.getId().intValue();
206-
}
204+
int id =
205+
field.getId() != null
206+
? field.getId().intValue()
207+
: (struct != null && fieldIdFromStruct < struct.fields().size())
208+
? struct.fields().get(fieldIdFromStruct).fieldId()
209+
: -1;
210+
fieldIdFromStruct++;
207211
Types.NestedField iField = (struct != null && id >= 0) ? struct.field(id) : null;
208212
results.add(visitField(iField, field, visitor));
209213
}

parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@
4747
import org.apache.iceberg.MetricsConfig;
4848
import org.apache.iceberg.Schema;
4949
import org.apache.iceberg.avro.AvroSchemaUtil;
50+
import org.apache.iceberg.data.Record;
51+
import org.apache.iceberg.data.parquet.GenericParquetReaders;
52+
import org.apache.iceberg.io.CloseableIterable;
5053
import org.apache.iceberg.io.InputFile;
5154
import org.apache.iceberg.mapping.MappingUtil;
5255
import org.apache.iceberg.mapping.NameMapping;
@@ -353,4 +356,144 @@ private Pair<File, Long> generateFile(
353356
records.toArray(new GenericData.Record[] {}));
354357
return Pair.of(file, size);
355358
}
359+
360+
@Test
361+
public void testReadNestedStructWithoutId() throws IOException {
362+
Schema icebergSchema =
363+
new Schema(
364+
Types.NestedField.required(
365+
1,
366+
"outer_struct",
367+
Types.StructType.of(
368+
Types.NestedField.optional(
369+
2,
370+
"middle_struct",
371+
Types.StructType.of(
372+
Types.NestedField.optional(
373+
3,
374+
"inner_struct",
375+
Types.StructType.of(
376+
Types.NestedField.optional(
377+
4, "value_field", Types.StringType.get()))))))));
378+
379+
// Create Avro schema without IDs.
380+
org.apache.avro.Schema avroSchema = createAvroSchemaWithoutIds();
381+
382+
// Write test data to Parquet file.
383+
File file = createTempFile(temp);
384+
writeParquetFile(file, avroSchema);
385+
386+
// Read and verify the data.
387+
try (CloseableIterable<Record> reader =
388+
Parquet.read(Files.localInput(file))
389+
.project(icebergSchema)
390+
.createReaderFunc(
391+
fileSchema -> GenericParquetReaders.buildReader(icebergSchema, fileSchema))
392+
.build()) {
393+
394+
org.apache.iceberg.data.Record readRecord = Iterables.getOnlyElement(reader);
395+
verifyNestedStructData(readRecord);
396+
}
397+
}
398+
399+
private org.apache.avro.Schema createAvroSchemaWithoutIds() {
400+
org.apache.avro.Schema innerStructSchema =
401+
org.apache.avro.Schema.createRecord("inner_struct_type", null, null, false);
402+
innerStructSchema.setFields(
403+
List.of(
404+
new org.apache.avro.Schema.Field(
405+
"value_field",
406+
org.apache.avro.Schema.createUnion(
407+
List.of(
408+
org.apache.avro.Schema.create(org.apache.avro.Schema.Type.NULL),
409+
org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING))),
410+
null,
411+
org.apache.avro.JsonProperties.NULL_VALUE)));
412+
413+
org.apache.avro.Schema middleStructSchema =
414+
org.apache.avro.Schema.createRecord("middle_struct_type", null, null, false);
415+
middleStructSchema.setFields(
416+
List.of(
417+
new org.apache.avro.Schema.Field(
418+
"inner_struct",
419+
org.apache.avro.Schema.createUnion(
420+
List.of(
421+
org.apache.avro.Schema.create(org.apache.avro.Schema.Type.NULL),
422+
innerStructSchema)),
423+
null,
424+
org.apache.avro.JsonProperties.NULL_VALUE)));
425+
426+
org.apache.avro.Schema outerStructSchema =
427+
org.apache.avro.Schema.createRecord("outer_struct_type", null, null, false);
428+
outerStructSchema.setFields(
429+
List.of(
430+
new org.apache.avro.Schema.Field(
431+
"middle_struct",
432+
org.apache.avro.Schema.createUnion(
433+
List.of(
434+
org.apache.avro.Schema.create(org.apache.avro.Schema.Type.NULL),
435+
middleStructSchema)),
436+
null,
437+
org.apache.avro.JsonProperties.NULL_VALUE)));
438+
439+
org.apache.avro.Schema recordSchema =
440+
org.apache.avro.Schema.createRecord("test_record", null, null, false);
441+
recordSchema.setFields(
442+
List.of(new org.apache.avro.Schema.Field("outer_struct", outerStructSchema, null, null)));
443+
444+
return recordSchema;
445+
}
446+
447+
private void writeParquetFile(File file, org.apache.avro.Schema avroSchema) throws IOException {
448+
// Create test data.
449+
GenericData.Record record = createNestedRecord(avroSchema);
450+
451+
// Write to Parquet file.
452+
try (ParquetWriter<GenericRecord> writer =
453+
AvroParquetWriter.<GenericRecord>builder(new org.apache.hadoop.fs.Path(file.toURI()))
454+
.withSchema(avroSchema)
455+
.withDataModel(GenericData.get())
456+
.build()) {
457+
writer.write(record);
458+
}
459+
}
460+
461+
private GenericData.Record createNestedRecord(org.apache.avro.Schema avroSchema) {
462+
org.apache.avro.Schema outerSchema = avroSchema.getField("outer_struct").schema();
463+
org.apache.avro.Schema middleSchema =
464+
outerSchema.getField("middle_struct").schema().getTypes().get(1);
465+
org.apache.avro.Schema innerSchema =
466+
middleSchema.getField("inner_struct").schema().getTypes().get(1);
467+
468+
GenericRecordBuilder innerBuilder = new GenericRecordBuilder(innerSchema);
469+
innerBuilder.set("value_field", "test_value");
470+
471+
GenericRecordBuilder middleBuilder = new GenericRecordBuilder(middleSchema);
472+
middleBuilder.set("inner_struct", innerBuilder.build());
473+
474+
GenericRecordBuilder outerBuilder = new GenericRecordBuilder(outerSchema);
475+
outerBuilder.set("middle_struct", middleBuilder.build());
476+
477+
GenericRecordBuilder recordBuilder = new GenericRecordBuilder(avroSchema);
478+
recordBuilder.set("outer_struct", outerBuilder.build());
479+
480+
return recordBuilder.build();
481+
}
482+
483+
private void verifyNestedStructData(org.apache.iceberg.data.Record record) {
484+
org.apache.iceberg.data.Record outerStruct = (org.apache.iceberg.data.Record) record.get(0);
485+
assertThat(outerStruct).isNotNull().withFailMessage("Outer struct should not be null");
486+
487+
org.apache.iceberg.data.Record middleStruct =
488+
(org.apache.iceberg.data.Record) outerStruct.get(0);
489+
assertThat(middleStruct).isNotNull().withFailMessage("Middle struct should not be null");
490+
491+
org.apache.iceberg.data.Record innerStruct =
492+
(org.apache.iceberg.data.Record) middleStruct.get(0);
493+
assertThat(innerStruct).isNotNull().withFailMessage("Inner struct should not be null");
494+
495+
assertThat(innerStruct.get(0).toString())
496+
.isEqualTo("test_value")
497+
.withFailMessage("Inner value field should match expected value");
498+
}
356499
}

0 commit comments

Comments
 (0)