update

alamb · alamb · commit 2c8b5616d2ff · 2025-07-22T10:44:39.000-04:00
diff --git a/parquet/tests/arrow_reader/io.rs b/parquet/tests/arrow_reader/io.rs
@@ -34,7 +34,7 @@ use parquet::file::properties::WriterProperties;
 use parquet::file::reader::{ChunkReader, Length};
 use parquet::file::FOOTER_SIZE;
 use parquet::format::PageLocation;
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::io::Read;
 use std::ops::Range;
 use std::sync::{Arc, Mutex};
@@ -221,18 +221,22 @@ struct TestColumnChunk {
     /// The name of the column
     name: String,
 
-    /// The location of the column chunk in the file
-    byte_range: Range<usize>,
+    /// The location of the entire column chunk in the file, including the
+    /// dictionary page (if any) and data pages.
+    location: Range<usize>,
 
-    /// The location of the data pages in the column chunk
+    /// The offset of the start of the dictionary page, if any
+    dictionary_page_location: Option<i64>,
+
+    /// The location of the data pages in the file
     page_locations: Vec<PageLocation>,
 }
 
 /// Information about the pages in a single row group
 #[derive(Debug)]
 struct TestRowGroup {
     /// Maps column_name -> Information about the column chunk
-    columns: HashMap<String, TestColumnChunk>,
+    columns: BTreeMap<String, TestColumnChunk>,
 }
 
 /// Information about all the row groups in a Parquet file, extracted from its metadata
@@ -255,17 +259,29 @@ impl TestRowGroups {
                     .enumerate()
                     .map(|(col_idx, col_meta)| {
                         let column_name = col_meta.column_descr().name().to_string();
-                        let locations = offset_index[rg_index][col_idx].page_locations();
-                        let (start_offset, end_offset) = col_meta.byte_range();
+                        let page_locations = offset_index[rg_index][col_idx].page_locations().to_vec();
+                        let dictionary_page_location = col_meta
+                            .dictionary_page_offset();
+
+                        // We can find the byte range of the entire column chunk
+                        let (start_offset, length) = col_meta.byte_range();
+                        let start_offset = start_offset as usize;
+                        let end_offset = start_offset + length as usize;
+
                         let test_column_chunk = TestColumnChunk {
                             name: column_name.clone(),
-                            byte_range: start_offset as usize..end_offset as usize,
-                            page_locations: locations.to_vec(),
+                            location: start_offset ..end_offset,
+                            dictionary_page_location,
+                            page_locations,
                         };
 
-                        (column_name, test_column_chunk)
+                        test_column_chunk
+                    })
+                    .map(|test_column_chunk| {
+                        // make key=value pairs to insert into the BTreeMap
+                        (test_column_chunk.name.clone(), test_column_chunk)
                     })
-                    .collect::<HashMap<_, _>>();
+                    .collect::<BTreeMap<_, _>>();
                 TestRowGroup { columns }
             })
             .collect();
@@ -329,6 +345,9 @@ impl OperationLog {
 
     /// Adds entries to the operation log for each interesting object that is
     /// accessed by the specified range
+    ///
+    /// This function checks the range against the possible locations in order
+    /// and adds the appropriate operation to the log for the first match found.
     fn log_access(&self, range: &Range<usize>) {
         let start = range.start as i64;
         let end = range.end as i64;
@@ -339,50 +358,80 @@ impl OperationLog {
         );
 
         // figure out what logical part of the file this range corresponds to
-        if self.footer_location.contains(&range.start) && self.footer_location.contains(&(range.end-1)) {
+        if self.metadata_location.contains(&range.start) || self.metadata_location.contains(&(range.end-1)) {
             self.add_operation(format!(
-                "Read Footer: {location_description}"
+                "Read Metadata: {location_description}"
             ));
             return;
         }
-        if self.metadata_location.contains(&range.start) && self.metadata_location.contains(&(range.end-1)) {
+
+         if self.footer_location.contains(&range.start) || self.footer_location.contains(&(range.end-1)) {
             self.add_operation(format!(
-                "Read Metadata: {location_description}"
+                "Read Footer: {location_description}"
             ));
-            return;
+             return;
         }
 
-
-        let mut found = false;
+        // Search for the location in each column chunk.
+        //
+        // The actual parquet reader must in general decode the page headers
+        // and determine the byte ranges of the pages. However, for this test
+        // we assume the following layout:
+        //
+        // ```text
+        // (Dictionary Page)
+        // (Data Page)
+        // ...
+        // (Data Page)
+        // ```
+        //
+        // We also assume that each column chunk's `page_locations` holds the
+        // location of all of its data pages, so any read operation that overlaps
+        // with a data page location is considered a read of that page, and any
+        // other read must be a dictionary page read.
         for (row_group_index, row_group) in self.row_groups.iter().enumerate() {
             for (column_name, test_column_chunk) in &row_group.columns {
-                let column_byte_range = &test_column_chunk.byte_range;
+                // Check if the range overlaps with any data page locations
+                let page_locations = test_column_chunk.page_locations.iter();
+                for (page_index, page_location) in page_locations.enumerate() {
+                    let page_offset = page_location.offset as i64;
+                    let page_end = page_offset + page_location.compressed_page_size as i64;
+                    if start >= page_offset && end <= page_end {
+                        self.add_operation(format!(
+                            "Read Row Group {row_group_index}, column '{column_name}', Data Page {page_index}: {location_description}",
+                        ));
+                        return;
+                    }
+                }
+
+                // Check if the range overlaps with the dictionary page location
+                if let Some(dict_page_offset) = test_column_chunk.dictionary_page_location {
+                    let dict_page_end = dict_page_offset + test_column_chunk.location.len() as i64;
+                    if start >= dict_page_offset as i64 && end < dict_page_end {
+                        self.add_operation(format!(
+                            "Read Row Group {row_group_index}, column '{column_name}': Dictionary Page: {location_description}",
+                        ));
+                        return;
+                    }
+                }
+
+                let column_byte_range = &test_column_chunk.location;
                 if column_byte_range.contains(&range.start)
                     && column_byte_range.contains(&(range.end - 1))
                 {
-                    found = true;
                     self.add_operation(format!(
                         "Read Row Group {row_group_index}, column '{column_name}': {location_description}",
                     ));
-                }
-                // Check if the range overlaps with any of the page locations
-                let page_locations = test_column_chunk.page_locations.iter();
-                for (page_index, page_location) in page_locations.enumerate() {
-                    if page_location.offset >= start && page_location.offset < end {
-                        found = true;
-                        self.add_operation(format!(
-                            "Read Row Group {row_group_index}, column '{column_name}', page {page_index}: {location_description}",
-                        ));
-                    }
+                    return;
                 }
             }
         }
 
-        if !found {
+        // If we reach here, the range does not match any known logical part of the file
             self.add_operation(format!(
                 "UNKNOWN: {location_description}"
             ));
-        }
+
     }
 }