Skip to content

Commit e2656c9

Browse files
committed
address comments
1 parent 6451fe0 commit e2656c9

File tree

1 file changed

+41
-39
lines changed

1 file changed

+41
-39
lines changed

arrow-row/src/variable.rs

+41-39
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use arrow_array::builder::BufferBuilder;
2020
use arrow_array::*;
2121
use arrow_buffer::bit_util::ceil;
2222
use arrow_buffer::MutableBuffer;
23-
use arrow_data::{ArrayDataBuilder, ByteView};
23+
use arrow_data::ArrayDataBuilder;
2424
use arrow_schema::{DataType, SortOptions};
2525
use builder::make_view;
2626

@@ -259,62 +259,64 @@ fn decode_binary_view_inner(
259259
valid
260260
});
261261

262-
let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum();
263-
let mut values = MutableBuffer::new(values_capacity);
262+
// We create two buffers; the inline buffer is used for quick UTF-8 validation.
263+
let mut output_buffer_cap = 0;
264+
let mut inline_buffer_cap = 0;
265+
for r in rows.iter() {
266+
let len = decoded_len(r, options);
267+
if len > 12 {
268+
output_buffer_cap += len;
269+
} else {
270+
inline_buffer_cap += len;
271+
}
272+
}
273+
274+
let mut output_buffer = MutableBuffer::new(output_buffer_cap);
275+
let mut inline_buffer = MutableBuffer::new(inline_buffer_cap);
264276
let mut views = BufferBuilder::<u128>::new(len);
265277

266278
for row in rows {
267-
let start_offset = values.len();
268-
let offset = decode_blocks(row, options, |b| values.extend_from_slice(b));
279+
let start_offset = output_buffer.len();
280+
let offset = decode_blocks(row, options, |b| {
281+
let val = if b.len() <= 12 {
282+
let old_len = inline_buffer.len();
283+
inline_buffer.extend_from_slice(b);
284+
// Safety: we just extended the buffer with the length of `b`
285+
unsafe { inline_buffer.get_unchecked_mut(old_len..) }
286+
} else {
287+
output_buffer.extend_from_slice(b);
288+
debug_assert_eq!(b, &output_buffer[start_offset..]);
289+
// Safety: we just extended the buffer with the length of `b`
290+
unsafe { output_buffer.get_unchecked_mut(start_offset..) }
291+
};
292+
if options.descending {
293+
val.iter_mut().for_each(|o| *o = !*o);
294+
}
295+
296+
let view = make_view(val, 0, start_offset as u32);
297+
views.append(view);
298+
});
269299
if row[0] == null_sentinel(options) {
270300
debug_assert_eq!(offset, 1);
271-
debug_assert_eq!(start_offset, values.len());
301+
debug_assert_eq!(start_offset, output_buffer.len());
272302
views.append(0);
273-
} else {
274-
let view = make_view(
275-
unsafe { values.get_unchecked(start_offset..) },
276-
0,
277-
start_offset as u32,
278-
);
279-
views.append(view);
280303
}
281304
*row = &row[offset..];
282305
}
283306

284-
if options.descending {
285-
values.as_slice_mut().iter_mut().for_each(|o| *o = !*o);
286-
for view in views.as_slice_mut() {
287-
let len = *view as u32;
288-
if len <= 12 {
289-
let mut bytes = view.to_le_bytes();
290-
bytes
291-
.iter_mut()
292-
.skip(4)
293-
.take(len as usize)
294-
.for_each(|o| *o = !*o);
295-
*view = u128::from_le_bytes(bytes);
296-
} else {
297-
let mut byte_view = ByteView::from(*view);
298-
let mut prefix = byte_view.prefix.to_le_bytes();
299-
prefix.iter_mut().for_each(|o| *o = !*o);
300-
byte_view.prefix = u32::from_le_bytes(prefix);
301-
*view = byte_view.into();
302-
}
303-
}
304-
}
305-
306307
if check_utf8 {
307-
// the values contains all data, no matter if it is short or long
308-
// we can validate utf8 in one go.
309-
std::str::from_utf8(values.as_slice()).unwrap();
308+
// We validate the utf8 of the output buffer and the inline buffer
309+
// This is much faster than validating each string individually
310+
std::str::from_utf8(output_buffer.as_slice()).unwrap();
311+
std::str::from_utf8(inline_buffer.as_slice()).unwrap();
310312
}
311313

312314
let builder = ArrayDataBuilder::new(DataType::BinaryView)
313315
.len(len)
314316
.null_count(null_count)
315317
.null_bit_buffer(Some(nulls.into()))
316318
.add_buffer(views.finish())
317-
.add_buffer(values.into());
319+
.add_buffer(output_buffer.into());
318320

319321
// SAFETY:
320322
// Valid by construction above

0 commit comments

Comments
 (0)