2424import java .util .List ;
2525import org .apache .parquet .Version ;
2626import org .apache .parquet .VersionParser ;
27+ import org .apache .parquet .bytes .BytesInput ;
2728import org .apache .parquet .column .ColumnDescriptor ;
2829import org .apache .parquet .column .ColumnReader ;
30+ import org .apache .parquet .column .Dictionary ;
2931import org .apache .parquet .column .ParquetProperties ;
3032import org .apache .parquet .column .page .DataPage ;
3133import org .apache .parquet .column .page .DataPageV2 ;
34+ import org .apache .parquet .column .page .DictionaryPage ;
3235import org .apache .parquet .column .page .mem .MemPageReader ;
3336import org .apache .parquet .column .page .mem .MemPageWriter ;
3437import org .apache .parquet .io .api .Binary ;
3538import org .apache .parquet .io .api .PrimitiveConverter ;
3639import org .apache .parquet .schema .MessageType ;
3740import org .apache .parquet .schema .MessageTypeParser ;
41+ import org .junit .Assert ;
3842import org .junit .Test ;
3943
4044public class TestColumnReaderImpl {
@@ -53,8 +57,28 @@ public void addBinary(Binary value) {
5357
5458 @ Test
5559 public void test () throws Exception {
60+ ColumnDescriptor col = requiredBinaryColumn ();
61+ MemPageWriter pageWriter = writeBinaryDictColumn (col );
62+ List <DataPage > pages = pageWriter .getPages ();
63+ int valueCount = 0 ;
64+ int rowCount = 0 ;
65+ for (DataPage dataPage : pages ) {
66+ valueCount += dataPage .getValueCount ();
67+ rowCount += ((DataPageV2 ) dataPage ).getRowCount ();
68+ }
69+ assertEquals (rows , rowCount );
70+ assertEquals (rows , valueCount );
71+ MemPageReader pageReader = toReader (pageWriter );
72+ validateExpectedValuesAndCount (col , pageReader );
73+ }
74+
75+ private static ColumnDescriptor requiredBinaryColumn () {
5676 MessageType schema = MessageTypeParser .parseMessageType ("message test { required binary foo; }" );
5777 ColumnDescriptor col = schema .getColumns ().get (0 );
78+ return col ;
79+ }
80+
81+ private MemPageWriter writeBinaryDictColumn (ColumnDescriptor col ) {
5882 MemPageWriter pageWriter = new MemPageWriter ();
5983 ColumnWriterV2 columnWriterV2 = new ColumnWriterV2 (
6084 col ,
@@ -72,16 +96,15 @@ public void test() throws Exception {
7296 }
7397 columnWriterV2 .writePage ();
7498 columnWriterV2 .finalizeColumnChunk ();
75- List <DataPage > pages = pageWriter .getPages ();
76- int valueCount = 0 ;
77- int rowCount = 0 ;
78- for (DataPage dataPage : pages ) {
79- valueCount += dataPage .getValueCount ();
80- rowCount += ((DataPageV2 ) dataPage ).getRowCount ();
81- }
82- assertEquals (rows , rowCount );
83- assertEquals (rows , valueCount );
84- MemPageReader pageReader = new MemPageReader (rows , pages .iterator (), pageWriter .getDictionaryPage ());
99+ return pageWriter ;
100+ }
101+
102+ private MemPageReader toReader (MemPageWriter pageWriter ) {
103+ return new MemPageReader (rows , pageWriter .getPages ().iterator (), pageWriter .getDictionaryPage ());
104+ }
105+
106+ private void validateExpectedValuesAndCount (ColumnDescriptor col , MemPageReader pageReader )
107+ throws VersionParser .VersionParseException {
85108 ValidatingConverter converter = new ValidatingConverter ();
86109 ColumnReader columnReader =
87110 new ColumnReaderImpl (col , pageReader , converter , VersionParser .parse (Version .FULL_VERSION ));
@@ -124,7 +147,7 @@ public void testOptional() throws Exception {
124147 }
125148 assertEquals (rows , rowCount );
126149 assertEquals (rows , valueCount );
127- MemPageReader pageReader = new MemPageReader ( rows , pages . iterator (), pageWriter . getDictionaryPage () );
150+ MemPageReader pageReader = toReader ( pageWriter );
128151 ValidatingConverter converter = new ValidatingConverter ();
129152 ColumnReader columnReader =
130153 new ColumnReaderImpl (col , pageReader , converter , VersionParser .parse (Version .FULL_VERSION ));
@@ -135,4 +158,29 @@ public void testOptional() throws Exception {
135158 }
136159 assertEquals (0 , converter .count );
137160 }
161+
162+ @ Test
163+ public void testDeduplicatedDecodedDictionary () throws Exception {
164+ ColumnDescriptor col = requiredBinaryColumn ();
165+ MemPageWriter pageWriter = writeBinaryDictColumn (col );
166+
167+ DictionaryPage dictionaryPage = pageWriter .getDictionaryPage ();
168+ Assert .assertNotNull ("Expected a dictionary" , dictionaryPage );
169+
170+ Dictionary dict = dictionaryPage .decode (col );
171+
172+ // construct a page reader from a dictionary page that lacks bytes but stores the decoded data.
173+ MemPageReader pageReader = new MemPageReader (
174+ rows ,
175+ pageWriter .getPages ().iterator (),
176+ new DictionaryPage (
177+ BytesInput .empty (), dictionaryPage .getDictionarySize (), dictionaryPage .getEncoding ()) {
178+ @ Override
179+ public Dictionary decode (ColumnDescriptor path ) {
180+ return dict ;
181+ }
182+ });
183+
184+ validateExpectedValuesAndCount (col , pageReader );
185+ }
138186}
0 commit comments