17
17
18
18
use crate :: arrow:: array_reader:: { read_records, skip_records, ArrayReader } ;
19
19
use crate :: arrow:: buffer:: view_buffer:: ViewBuffer ;
20
+ use crate :: arrow:: decoder:: DictIndexDecoder ;
20
21
use crate :: arrow:: record_reader:: GenericRecordReader ;
21
22
use crate :: arrow:: schema:: parquet_to_arrow_field;
22
23
use crate :: basic:: { ConvertedType , Encoding } ;
@@ -25,6 +26,7 @@ use crate::column::reader::decoder::ColumnValueDecoder;
25
26
use crate :: errors:: { ParquetError , Result } ;
26
27
use crate :: schema:: types:: ColumnDescPtr ;
27
28
use arrow_array:: ArrayRef ;
29
+ use arrow_data:: ByteView ;
28
30
use arrow_schema:: DataType as ArrowType ;
29
31
use bytes:: Bytes ;
30
32
use std:: any:: Any ;
@@ -210,6 +212,7 @@ impl ColumnValueDecoder for ByteViewArrayColumnValueDecoder {
210
212
/// A generic decoder from uncompressed parquet value data to [`ViewBuffer`]
211
213
pub enum ByteViewArrayDecoder {
212
214
Plain ( ByteViewArrayDecoderPlain ) ,
215
+ Dictionary ( ByteViewArrayDecoderDictionary ) ,
213
216
}
214
217
215
218
impl ByteViewArrayDecoder {
@@ -227,10 +230,14 @@ impl ByteViewArrayDecoder {
227
230
num_values,
228
231
validate_utf8,
229
232
) ) ,
230
- Encoding :: RLE_DICTIONARY
231
- | Encoding :: PLAIN_DICTIONARY
232
- | Encoding :: DELTA_LENGTH_BYTE_ARRAY
233
- | Encoding :: DELTA_BYTE_ARRAY => unimplemented ! ( "stay tuned!" ) ,
233
+ Encoding :: RLE_DICTIONARY | Encoding :: PLAIN_DICTIONARY => {
234
+ ByteViewArrayDecoder :: Dictionary ( ByteViewArrayDecoderDictionary :: new (
235
+ data, num_levels, num_values,
236
+ ) )
237
+ }
238
+ Encoding :: DELTA_LENGTH_BYTE_ARRAY | Encoding :: DELTA_BYTE_ARRAY => {
239
+ unimplemented ! ( "stay tuned!" )
240
+ }
234
241
_ => {
235
242
return Err ( general_err ! (
236
243
"unsupported encoding for byte array: {}" ,
@@ -247,17 +254,27 @@ impl ByteViewArrayDecoder {
247
254
& mut self ,
248
255
out : & mut ViewBuffer ,
249
256
len : usize ,
250
- _dict : Option < & ViewBuffer > ,
257
+ dict : Option < & ViewBuffer > ,
251
258
) -> Result < usize > {
252
259
match self {
253
260
ByteViewArrayDecoder :: Plain ( d) => d. read ( out, len) ,
261
+ ByteViewArrayDecoder :: Dictionary ( d) => {
262
+ let dict = dict
263
+ . ok_or_else ( || general_err ! ( "dictionary required for dictionary encoding" ) ) ?;
264
+ d. read ( out, dict, len)
265
+ }
254
266
}
255
267
}
256
268
257
269
/// Skip `len` values
258
- pub fn skip ( & mut self , len : usize , _dict : Option < & ViewBuffer > ) -> Result < usize > {
270
+ pub fn skip ( & mut self , len : usize , dict : Option < & ViewBuffer > ) -> Result < usize > {
259
271
match self {
260
272
ByteViewArrayDecoder :: Plain ( d) => d. skip ( len) ,
273
+ ByteViewArrayDecoder :: Dictionary ( d) => {
274
+ let dict = dict
275
+ . ok_or_else ( || general_err ! ( "dictionary required for dictionary encoding" ) ) ?;
276
+ d. skip ( dict, len)
277
+ }
261
278
}
262
279
}
263
280
}
@@ -348,6 +365,90 @@ impl ByteViewArrayDecoderPlain {
348
365
}
349
366
}
350
367
368
+ pub struct ByteViewArrayDecoderDictionary {
369
+ decoder : DictIndexDecoder ,
370
+ }
371
+
372
+ impl ByteViewArrayDecoderDictionary {
373
+ fn new ( data : Bytes , num_levels : usize , num_values : Option < usize > ) -> Self {
374
+ Self {
375
+ decoder : DictIndexDecoder :: new ( data, num_levels, num_values) ,
376
+ }
377
+ }
378
+
379
+ /// Reads the next indexes from self.decoder
380
+ /// the indexes are assumed to be indexes into `dict`
381
+ /// the output values are written to output
382
+ ///
383
+ /// Assumptions / Optimization
384
+ /// This function checks if dict.buffers() are the last buffers in `output`, and if so
385
+ /// reuses the dictionary page buffers directly without copying data
386
+ fn read ( & mut self , output : & mut ViewBuffer , dict : & ViewBuffer , len : usize ) -> Result < usize > {
387
+ if dict. is_empty ( ) || len == 0 {
388
+ return Ok ( 0 ) ;
389
+ }
390
+
391
+ // Check if the last few buffer of `output`` are the same as the `dict` buffer
392
+ // This is to avoid creating a new buffers if the same dictionary is used for multiple `read`
393
+ let need_to_create_new_buffer = {
394
+ if output. buffers . len ( ) >= dict. buffers . len ( ) {
395
+ let offset = output. buffers . len ( ) - dict. buffers . len ( ) ;
396
+ output. buffers [ offset..]
397
+ . iter ( )
398
+ . zip ( dict. buffers . iter ( ) )
399
+ . any ( |( a, b) | !a. ptr_eq ( b) )
400
+ } else {
401
+ true
402
+ }
403
+ } ;
404
+
405
+ if need_to_create_new_buffer {
406
+ for b in dict. buffers . iter ( ) {
407
+ output. buffers . push ( b. clone ( ) ) ;
408
+ }
409
+ }
410
+
411
+ // Calculate the offset of the dictionary buffers in the output buffers
412
+ // For example if the 2nd buffer in the dictionary is the 5th buffer in the output buffers,
413
+ // then the base_buffer_idx is 5 - 2 = 3
414
+ let base_buffer_idx = output. buffers . len ( ) as u32 - dict. buffers . len ( ) as u32 ;
415
+
416
+ self . decoder . read ( len, |keys| {
417
+ for k in keys {
418
+ let view = dict
419
+ . views
420
+ . get ( * k as usize )
421
+ . ok_or_else ( || general_err ! ( "invalid key={} for dictionary" , * k) ) ?;
422
+ let len = * view as u32 ;
423
+ if len <= 12 {
424
+ // directly append the view if it is inlined
425
+ // Safety: the view is from the dictionary, so it is valid
426
+ unsafe {
427
+ output. append_raw_view_unchecked ( view) ;
428
+ }
429
+ } else {
430
+ // correct the buffer index and append the view
431
+ let mut view = ByteView :: from ( * view) ;
432
+ view. buffer_index += base_buffer_idx;
433
+ // Safety: the view is from the dictionary,
434
+ // we corrected the index value to point it to output buffer, so it is valid
435
+ unsafe {
436
+ output. append_raw_view_unchecked ( & view. into ( ) ) ;
437
+ }
438
+ }
439
+ }
440
+ Ok ( ( ) )
441
+ } )
442
+ }
443
+
444
+ fn skip ( & mut self , dict : & ViewBuffer , to_skip : usize ) -> Result < usize > {
445
+ if dict. is_empty ( ) {
446
+ return Ok ( 0 ) ;
447
+ }
448
+ self . decoder . skip ( to_skip)
449
+ }
450
+ }
451
+
351
452
/// Check that `val` is a valid UTF-8 sequence
352
453
pub fn check_valid_utf8 ( val : & [ u8 ] ) -> Result < ( ) > {
353
454
match std:: str:: from_utf8 ( val) {
@@ -386,8 +487,11 @@ mod tests {
386
487
. unwrap ( ) ;
387
488
388
489
for ( encoding, page) in pages {
389
- if encoding != Encoding :: PLAIN {
390
- // skip non-plain encodings for now as they are not yet implemented
490
+ if encoding != Encoding :: PLAIN
491
+ && encoding != Encoding :: RLE_DICTIONARY
492
+ && encoding != Encoding :: PLAIN_DICTIONARY
493
+ {
494
+ // skip unsupported encodings for now as they are not yet implemented
391
495
continue ;
392
496
}
393
497
let mut output = ViewBuffer :: default ( ) ;
@@ -399,7 +503,6 @@ mod tests {
399
503
assert_eq ! ( decoder. read( & mut output, 4 ) . unwrap( ) , 0 ) ;
400
504
401
505
assert_eq ! ( output. views. len( ) , 4 ) ;
402
- assert_eq ! ( output. buffers. len( ) , 4 ) ;
403
506
404
507
let valid = [ false , false , true , true , false , true , true , false , false ] ;
405
508
let valid_buffer = Buffer :: from_iter ( valid. iter ( ) . cloned ( ) ) ;
0 commit comments