Skip to content

Skip page should also support skip dict page #7409

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 16, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 58 additions & 2 deletions parquet/src/file/serialized_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -981,8 +981,18 @@ impl<R: ChunkReader> PageReader for SerializedPageReader<R> {
}
Ok(())
}
SerializedPageReaderState::Pages { page_locations, .. } => {
page_locations.pop_front();
SerializedPageReaderState::Pages {
page_locations,
dictionary_page,
..
} => {
if dictionary_page.is_some() {
// If a dictionary page exists, consume it by taking it (sets to None)
dictionary_page.take();
} else {
// If no dictionary page exists, simply pop the data page from page_locations
page_locations.pop_front();
}

Ok(())
}
Expand Down Expand Up @@ -1895,6 +1905,52 @@ mod tests {
)
}

/// Checks that `skip_next_page` consumes a pending dictionary page (rather than
/// the first data page) when the reader was opened with the page index enabled.
#[test]
fn test_skip_next_page_with_dictionary_page() {
    // Open the fixture with page-index reading turned on.
    let file = get_test_file("alltypes_tiny_pages.parquet");
    let options = ReadOptionsBuilder::new().with_page_index().build();
    let file_reader = SerializedFileReader::new_with_options(file, options).unwrap();
    let row_group = file_reader.get_row_group(0).unwrap();

    // Column 9 is 'string_col' (boundary order: UNORDERED); this column chunk
    // holds 1 dictionary page and 352 data pages.
    let mut pages = row_group.get_column_page_reader(9).unwrap();

    // The first page reported by peek must be the dictionary page.
    let dict_meta = pages.peek_next_page().unwrap().unwrap();
    assert!(dict_meta.is_dict);

    // Skipping here is expected to drop the dictionary page only.
    pages.skip_next_page().unwrap();

    // So the next page handed out is the first of the 352 data pages.
    let first_data = pages.get_next_page().unwrap().unwrap();
    assert!(matches!(first_data.page_type(), basic::PageType::DATA_PAGE));

    // Walk the remaining 351 data pages (352 minus the one read just above),
    // keeping each peeked metadata entry so the count can be checked afterwards.
    let remaining: Vec<_> = (0..351)
        .map(|_| {
            let meta = pages.peek_next_page().unwrap().unwrap();
            // No further dictionary page may show up.
            assert!(!meta.is_dict);

            let page = pages.get_next_page().unwrap().unwrap();
            assert!(matches!(page.page_type(), basic::PageType::DATA_PAGE));

            meta
        })
        .collect();

    // The reader must now be exhausted for both peek and get.
    assert!(pages.peek_next_page().unwrap().is_none());
    assert!(pages.get_next_page().unwrap().is_none());

    // 351 metadata entries were collected by the walk above.
    assert_eq!(remaining.len(), 351);
}

#[test]
fn test_skip_page_with_offset_index() {
let test_file = get_test_file("alltypes_tiny_pages_plain.parquet");
Expand Down
Loading