@@ -516,7 +516,7 @@ impl ByteViewArrayDecoderDeltaLength {
516
516
// # Safety
517
517
// The length is from the delta length decoder, so it is valid
518
518
// The start_offset is calculated from the lengths, so it is valid
519
- // `start_offset` + * length is guaranteed to be within the bounds of `data`, as checked in `new`
519
+ // `start_offset + length` is guaranteed to be within the bounds of `data`, as checked in `new`
520
520
unsafe { output. append_view_unchecked ( block_id, current_offset as u32 , * length as u32 ) }
521
521
522
522
current_offset += * length as usize ;
@@ -562,51 +562,62 @@ impl ByteViewArrayDecoderDelta {
562
562
563
563
// Unlike other encodings, we need to copy the data.
564
564
//
565
- // DeltaByteArray data is stored using shared prefixes/suffixes,
565
+ // DeltaByteArray data is stored using shared prefixes/suffixes,
566
566
// which results in potentially non-contiguous
567
567
// strings, while Arrow encodings require contiguous strings
568
568
//
569
569
// <https://parquet.apache.org/docs/file-format/data-pages/encodings/#delta-strings-delta_byte_array--7>
570
-
570
+
571
571
fn read ( & mut self , output : & mut ViewBuffer , len : usize ) -> Result < usize > {
572
572
output. views . reserve ( len. min ( self . decoder . remaining ( ) ) ) ;
573
573
574
574
// The array buffer only holds long strings (more than 12 bytes); shorter strings are inlined in the view.
575
575
let mut array_buffer: Vec < u8 > = Vec :: with_capacity ( 4096 ) ;
576
576
577
- // utf8 validation buffer have all strings, we batch the strings in one buffer to accelerate validation
578
- let mut utf8_validation_buffer = if self . validate_utf8 {
579
- Some ( Vec :: with_capacity ( 4096 ) )
580
- } else {
581
- None
582
- } ;
583
-
584
577
let buffer_id = output. buffers . len ( ) as u32 ;
585
578
586
- let read = self . decoder . read ( len, |bytes| {
587
- let offset = array_buffer. len ( ) ;
588
- let view = make_view ( bytes, buffer_id, offset as u32 ) ;
589
- if bytes. len ( ) > 12 {
590
- // only copy the data to buffer if the string can not be inlined.
591
- array_buffer. extend_from_slice ( bytes) ;
592
- }
593
- if let Some ( v) = utf8_validation_buffer. as_mut ( ) {
594
- v. extend_from_slice ( bytes) ;
595
- }
596
-
597
- // # Safety
598
- // The buffer_id is the last buffer in the output buffers
599
- // The offset is calculated from the buffer, so it is valid
600
- // Utf-8 validation is done later
601
- unsafe {
602
- output. append_raw_view_unchecked ( & view) ;
603
- }
604
- Ok ( ( ) )
605
- } ) ?;
579
+ let read = if !self . validate_utf8 {
580
+ self . decoder . read ( len, |bytes| {
581
+ let offset = array_buffer. len ( ) ;
582
+ let view = make_view ( bytes, buffer_id, offset as u32 ) ;
583
+ if bytes. len ( ) > 12 {
584
+ // only copy the data to buffer if the string can not be inlined.
585
+ array_buffer. extend_from_slice ( bytes) ;
586
+ }
606
587
607
- utf8_validation_buffer
608
- . map ( |v| check_valid_utf8 ( & v) )
609
- . transpose ( ) ?;
588
+ // # Safety
589
+ // The buffer_id is the last buffer in the output buffers
590
+ // The offset is calculated from the buffer, so it is valid
591
+ unsafe {
592
+ output. append_raw_view_unchecked ( & view) ;
593
+ }
594
+ Ok ( ( ) )
595
+ } ) ?
596
+ } else {
597
+ // The UTF-8 validation buffer holds all strings; we batch them into one buffer to accelerate validation.
598
+ let mut utf8_validation_buffer = Vec :: with_capacity ( 4096 ) ;
599
+
600
+ let v = self . decoder . read ( len, |bytes| {
601
+ let offset = array_buffer. len ( ) ;
602
+ let view = make_view ( bytes, buffer_id, offset as u32 ) ;
603
+ if bytes. len ( ) > 12 {
604
+ // only copy the data to buffer if the string can not be inlined.
605
+ array_buffer. extend_from_slice ( bytes) ;
606
+ }
607
+ utf8_validation_buffer. extend_from_slice ( bytes) ;
608
+
609
+ // # Safety
610
+ // The buffer_id is the last buffer in the output buffers
611
+ // The offset is calculated from the buffer, so it is valid
612
+ // Utf-8 validation is done later
613
+ unsafe {
614
+ output. append_raw_view_unchecked ( & view) ;
615
+ }
616
+ Ok ( ( ) )
617
+ } ) ?;
618
+ check_valid_utf8 ( & utf8_validation_buffer) ?;
619
+ v
620
+ } ;
610
621
611
622
let actual_block_id = output. append_block ( array_buffer. into ( ) ) ;
612
623
assert_eq ! ( actual_block_id, buffer_id) ;
0 commit comments