Skip to content

Commit df486d1

Browse files
committed
update comments
1 parent c8bbc84 commit df486d1

File tree

1 file changed

+44
-33
lines changed

1 file changed

+44
-33
lines changed

parquet/src/arrow/array_reader/byte_view_array.rs

+44-33
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,7 @@ impl ByteViewArrayDecoderDeltaLength {
516516
// # Safety
517517
// The length is from the delta length decoder, so it is valid
518518
// The start_offset is calculated from the lengths, so it is valid
519-
// `start_offset` + *length is guaranteed to be within the bounds of `data`, as checked in `new`
519+
// `start_offset + length` is guaranteed to be within the bounds of `data`, as checked in `new`
520520
unsafe { output.append_view_unchecked(block_id, current_offset as u32, *length as u32) }
521521

522522
current_offset += *length as usize;
@@ -562,51 +562,62 @@ impl ByteViewArrayDecoderDelta {
562562

563563
// Unlike other encodings, we need to copy the data.
564564
//
565-
// DeltaByteArray data is stored using shared prefixes/suffixes,
565+
// DeltaByteArray data is stored using shared prefixes/suffixes,
566566
// which results in potentially non-contiguous
567567
// strings, while Arrow encodings require contiguous strings
568568
//
569569
// <https://parquet.apache.org/docs/file-format/data-pages/encodings/#delta-strings-delta_byte_array--7>
570-
570+
571571
fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result<usize> {
572572
output.views.reserve(len.min(self.decoder.remaining()));
573573

574574
// The array buffer only holds long strings, i.e. those longer than 12 bytes that cannot be inlined in the view
575575
let mut array_buffer: Vec<u8> = Vec::with_capacity(4096);
576576

577-
// utf8 validation buffer have all strings, we batch the strings in one buffer to accelerate validation
578-
let mut utf8_validation_buffer = if self.validate_utf8 {
579-
Some(Vec::with_capacity(4096))
580-
} else {
581-
None
582-
};
583-
584577
let buffer_id = output.buffers.len() as u32;
585578

586-
let read = self.decoder.read(len, |bytes| {
587-
let offset = array_buffer.len();
588-
let view = make_view(bytes, buffer_id, offset as u32);
589-
if bytes.len() > 12 {
590-
// only copy the data to buffer if the string can not be inlined.
591-
array_buffer.extend_from_slice(bytes);
592-
}
593-
if let Some(v) = utf8_validation_buffer.as_mut() {
594-
v.extend_from_slice(bytes);
595-
}
596-
597-
// # Safety
598-
// The buffer_id is the last buffer in the output buffers
599-
// The offset is calculated from the buffer, so it is valid
600-
// Utf-8 validation is done later
601-
unsafe {
602-
output.append_raw_view_unchecked(&view);
603-
}
604-
Ok(())
605-
})?;
579+
let read = if !self.validate_utf8 {
580+
self.decoder.read(len, |bytes| {
581+
let offset = array_buffer.len();
582+
let view = make_view(bytes, buffer_id, offset as u32);
583+
if bytes.len() > 12 {
584+
// only copy the data to buffer if the string can not be inlined.
585+
array_buffer.extend_from_slice(bytes);
586+
}
606587

607-
utf8_validation_buffer
608-
.map(|v| check_valid_utf8(&v))
609-
.transpose()?;
588+
// # Safety
589+
// The buffer_id is the index `array_buffer` will have once it is appended to the output buffers below
590+
// The offset is calculated from the buffer, so it is valid
591+
unsafe {
592+
output.append_raw_view_unchecked(&view);
593+
}
594+
Ok(())
595+
})?
596+
} else {
597+
// The utf8 validation buffer holds all strings; we batch them in one buffer to accelerate validation
598+
let mut utf8_validation_buffer = Vec::with_capacity(4096);
599+
600+
let v = self.decoder.read(len, |bytes| {
601+
let offset = array_buffer.len();
602+
let view = make_view(bytes, buffer_id, offset as u32);
603+
if bytes.len() > 12 {
604+
// only copy the data to buffer if the string can not be inlined.
605+
array_buffer.extend_from_slice(bytes);
606+
}
607+
utf8_validation_buffer.extend_from_slice(bytes);
608+
609+
// # Safety
610+
// The buffer_id is the index `array_buffer` will have once it is appended to the output buffers below
611+
// The offset is calculated from the buffer, so it is valid
612+
// Utf-8 validation is done later
613+
unsafe {
614+
output.append_raw_view_unchecked(&view);
615+
}
616+
Ok(())
617+
})?;
618+
check_valid_utf8(&utf8_validation_buffer)?;
619+
v
620+
};
610621

611622
let actual_block_id = output.append_block(array_buffer.into());
612623
assert_eq!(actual_block_id, buffer_id);

0 commit comments

Comments
 (0)