Skip to content

Commit e7a0008

Browse files
authored
Implement dictionary support for reading ByteView from parquet (#5973)
* implement dictionary encoding support * update comments
1 parent 035b589 commit e7a0008

File tree

2 files changed

+125
-9
lines changed

2 files changed

+125
-9
lines changed

parquet/src/arrow/array_reader/byte_view_array.rs

+112-9
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
use crate::arrow::array_reader::{read_records, skip_records, ArrayReader};
1919
use crate::arrow::buffer::view_buffer::ViewBuffer;
20+
use crate::arrow::decoder::DictIndexDecoder;
2021
use crate::arrow::record_reader::GenericRecordReader;
2122
use crate::arrow::schema::parquet_to_arrow_field;
2223
use crate::basic::{ConvertedType, Encoding};
@@ -25,6 +26,7 @@ use crate::column::reader::decoder::ColumnValueDecoder;
2526
use crate::errors::{ParquetError, Result};
2627
use crate::schema::types::ColumnDescPtr;
2728
use arrow_array::ArrayRef;
29+
use arrow_data::ByteView;
2830
use arrow_schema::DataType as ArrowType;
2931
use bytes::Bytes;
3032
use std::any::Any;
@@ -210,6 +212,7 @@ impl ColumnValueDecoder for ByteViewArrayColumnValueDecoder {
210212
/// A generic decoder from uncompressed parquet value data to [`ViewBuffer`]
211213
pub enum ByteViewArrayDecoder {
212214
Plain(ByteViewArrayDecoderPlain),
215+
Dictionary(ByteViewArrayDecoderDictionary),
213216
}
214217

215218
impl ByteViewArrayDecoder {
@@ -227,10 +230,14 @@ impl ByteViewArrayDecoder {
227230
num_values,
228231
validate_utf8,
229232
)),
230-
Encoding::RLE_DICTIONARY
231-
| Encoding::PLAIN_DICTIONARY
232-
| Encoding::DELTA_LENGTH_BYTE_ARRAY
233-
| Encoding::DELTA_BYTE_ARRAY => unimplemented!("stay tuned!"),
233+
Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => {
234+
ByteViewArrayDecoder::Dictionary(ByteViewArrayDecoderDictionary::new(
235+
data, num_levels, num_values,
236+
))
237+
}
238+
Encoding::DELTA_LENGTH_BYTE_ARRAY | Encoding::DELTA_BYTE_ARRAY => {
239+
unimplemented!("stay tuned!")
240+
}
234241
_ => {
235242
return Err(general_err!(
236243
"unsupported encoding for byte array: {}",
@@ -247,17 +254,27 @@ impl ByteViewArrayDecoder {
247254
&mut self,
248255
out: &mut ViewBuffer,
249256
len: usize,
250-
_dict: Option<&ViewBuffer>,
257+
dict: Option<&ViewBuffer>,
251258
) -> Result<usize> {
252259
match self {
253260
ByteViewArrayDecoder::Plain(d) => d.read(out, len),
261+
ByteViewArrayDecoder::Dictionary(d) => {
262+
let dict = dict
263+
.ok_or_else(|| general_err!("dictionary required for dictionary encoding"))?;
264+
d.read(out, dict, len)
265+
}
254266
}
255267
}
256268

257269
/// Skip `len` values
258-
pub fn skip(&mut self, len: usize, _dict: Option<&ViewBuffer>) -> Result<usize> {
270+
pub fn skip(&mut self, len: usize, dict: Option<&ViewBuffer>) -> Result<usize> {
259271
match self {
260272
ByteViewArrayDecoder::Plain(d) => d.skip(len),
273+
ByteViewArrayDecoder::Dictionary(d) => {
274+
let dict = dict
275+
.ok_or_else(|| general_err!("dictionary required for dictionary encoding"))?;
276+
d.skip(dict, len)
277+
}
261278
}
262279
}
263280
}
@@ -348,6 +365,90 @@ impl ByteViewArrayDecoderPlain {
348365
}
349366
}
350367

368+
pub struct ByteViewArrayDecoderDictionary {
369+
decoder: DictIndexDecoder,
370+
}
371+
372+
impl ByteViewArrayDecoderDictionary {
373+
fn new(data: Bytes, num_levels: usize, num_values: Option<usize>) -> Self {
374+
Self {
375+
decoder: DictIndexDecoder::new(data, num_levels, num_values),
376+
}
377+
}
378+
379+
/// Reads up to `len` dictionary indexes from `self.decoder`.
380+
/// Each index is looked up in `dict`; an index outside `dict.views` yields an error.
381+
/// The resulting views are appended to `output`.
382+
///
383+
/// Assumptions / Optimization
384+
/// This function checks if `dict.buffers` are the last buffers in `output`, and if so
385+
/// reuses the dictionary page buffers directly without copying data
386+
fn read(&mut self, output: &mut ViewBuffer, dict: &ViewBuffer, len: usize) -> Result<usize> {
387+
if dict.is_empty() || len == 0 {
388+
return Ok(0);
389+
}
390+
391+
// Check if the last few buffers of `output` are the same as the `dict` buffers
392+
// This avoids pushing the dictionary buffers again when the same dictionary is used for multiple `read` calls
393+
let need_to_create_new_buffer = {
394+
if output.buffers.len() >= dict.buffers.len() {
395+
let offset = output.buffers.len() - dict.buffers.len();
396+
output.buffers[offset..]
397+
.iter()
398+
.zip(dict.buffers.iter())
399+
.any(|(a, b)| !a.ptr_eq(b))
400+
} else {
401+
true
402+
}
403+
};
404+
405+
if need_to_create_new_buffer {
406+
for b in dict.buffers.iter() {
407+
output.buffers.push(b.clone());
408+
}
409+
}
410+
411+
// Calculate the offset of the dictionary buffers in the output buffers
412+
// For example if the 2nd buffer in the dictionary is the 5th buffer in the output buffers,
413+
// then the base_buffer_idx is 5 - 2 = 3
414+
let base_buffer_idx = output.buffers.len() as u32 - dict.buffers.len() as u32;
415+
416+
self.decoder.read(len, |keys| {
417+
for k in keys {
418+
let view = dict
419+
.views
420+
.get(*k as usize)
421+
.ok_or_else(|| general_err!("invalid key={} for dictionary", *k))?;
422+
let len = *view as u32;
423+
if len <= 12 {
424+
// directly append the view if it is inlined
425+
// Safety: the view is from the dictionary, so it is valid
426+
unsafe {
427+
output.append_raw_view_unchecked(view);
428+
}
429+
} else {
430+
// correct the buffer index and append the view
431+
let mut view = ByteView::from(*view);
432+
view.buffer_index += base_buffer_idx;
433+
// Safety: the view is from the dictionary,
434+
// we corrected the index value to point it to output buffer, so it is valid
435+
unsafe {
436+
output.append_raw_view_unchecked(&view.into());
437+
}
438+
}
439+
}
440+
Ok(())
441+
})
442+
}
443+
444+
fn skip(&mut self, dict: &ViewBuffer, to_skip: usize) -> Result<usize> {
445+
if dict.is_empty() {
446+
return Ok(0);
447+
}
448+
self.decoder.skip(to_skip)
449+
}
450+
}
451+
351452
/// Check that `val` is a valid UTF-8 sequence
352453
pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
353454
match std::str::from_utf8(val) {
@@ -386,8 +487,11 @@ mod tests {
386487
.unwrap();
387488

388489
for (encoding, page) in pages {
389-
if encoding != Encoding::PLAIN {
390-
// skip non-plain encodings for now as they are not yet implemented
490+
if encoding != Encoding::PLAIN
491+
&& encoding != Encoding::RLE_DICTIONARY
492+
&& encoding != Encoding::PLAIN_DICTIONARY
493+
{
494+
// skip unsupported encodings for now as they are not yet implemented
391495
continue;
392496
}
393497
let mut output = ViewBuffer::default();
@@ -399,7 +503,6 @@ mod tests {
399503
assert_eq!(decoder.read(&mut output, 4).unwrap(), 0);
400504

401505
assert_eq!(output.views.len(), 4);
402-
assert_eq!(output.buffers.len(), 4);
403506

404507
let valid = [false, false, true, true, false, true, true, false, false];
405508
let valid_buffer = Buffer::from_iter(valid.iter().cloned());

parquet/src/arrow/buffer/view_buffer.rs

+13
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ pub struct ViewBuffer {
3333
}
3434

3535
impl ViewBuffer {
36+
pub fn is_empty(&self) -> bool {
37+
self.views.is_empty()
38+
}
39+
3640
#[allow(unused)]
3741
pub fn append_block(&mut self, block: Buffer) -> u32 {
3842
let block_id = self.buffers.len() as u32;
@@ -56,6 +60,15 @@ impl ViewBuffer {
5660
self.views.push(view);
5761
}
5862

63+
/// Directly append a view to the view array.
64+
/// This is used when we create a StringViewArray from a dictionary whose values are StringViewArray.
65+
///
66+
/// # Safety
67+
/// The `view` must be a valid view as per the ByteView spec.
68+
pub unsafe fn append_raw_view_unchecked(&mut self, view: &u128) {
69+
self.views.push(*view);
70+
}
71+
5972
/// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer`
6073
#[allow(unused)]
6174
pub fn into_array(self, null_buffer: Option<Buffer>, data_type: &ArrowType) -> ArrayRef {

0 commit comments

Comments
 (0)