@@ -20,7 +20,7 @@ use arrow_array::builder::BufferBuilder;
20
20
use arrow_array:: * ;
21
21
use arrow_buffer:: bit_util:: ceil;
22
22
use arrow_buffer:: MutableBuffer ;
23
- use arrow_data:: { ArrayDataBuilder , ByteView } ;
23
+ use arrow_data:: ArrayDataBuilder ;
24
24
use arrow_schema:: { DataType , SortOptions } ;
25
25
use builder:: make_view;
26
26
@@ -259,62 +259,64 @@ fn decode_binary_view_inner(
259
259
valid
260
260
} ) ;
261
261
262
- let values_capacity: usize = rows. iter ( ) . map ( |row| decoded_len ( row, options) ) . sum ( ) ;
263
- let mut values = MutableBuffer :: new ( values_capacity) ;
262
+ // We create two buffers; the inline buffer is used for quick UTF-8 validation.
263
+ let mut output_buffer_cap = 0 ;
264
+ let mut inline_buffer_cap = 0 ;
265
+ for r in rows. iter ( ) {
266
+ let len = decoded_len ( r, options) ;
267
+ if len > 12 {
268
+ output_buffer_cap += len;
269
+ } else {
270
+ inline_buffer_cap += len;
271
+ }
272
+ }
273
+
274
+ let mut output_buffer = MutableBuffer :: new ( output_buffer_cap) ;
275
+ let mut inline_buffer = MutableBuffer :: new ( inline_buffer_cap) ;
264
276
let mut views = BufferBuilder :: < u128 > :: new ( len) ;
265
277
266
278
for row in rows {
267
- let start_offset = values. len ( ) ;
268
- let offset = decode_blocks ( row, options, |b| values. extend_from_slice ( b) ) ;
279
+ let start_offset = output_buffer. len ( ) ;
280
+ let offset = decode_blocks ( row, options, |b| {
281
+ let val = if b. len ( ) <= 12 {
282
+ let old_len = inline_buffer. len ( ) ;
283
+ inline_buffer. extend_from_slice ( b) ;
284
+ // Safety: we just extended the buffer with the length of `b`
285
+ unsafe { inline_buffer. get_unchecked_mut ( old_len..) }
286
+ } else {
287
+ output_buffer. extend_from_slice ( b) ;
288
+ debug_assert_eq ! ( b, & output_buffer[ start_offset..] ) ;
289
+ // Safety: we just extended the buffer with the length of `b`
290
+ unsafe { output_buffer. get_unchecked_mut ( start_offset..) }
291
+ } ;
292
+ if options. descending {
293
+ val. iter_mut ( ) . for_each ( |o| * o = !* o) ;
294
+ }
295
+
296
+ let view = make_view ( val, 0 , start_offset as u32 ) ;
297
+ views. append ( view) ;
298
+ } ) ;
269
299
if row[ 0 ] == null_sentinel ( options) {
270
300
debug_assert_eq ! ( offset, 1 ) ;
271
- debug_assert_eq ! ( start_offset, values . len( ) ) ;
301
+ debug_assert_eq ! ( start_offset, output_buffer . len( ) ) ;
272
302
views. append ( 0 ) ;
273
- } else {
274
- let view = make_view (
275
- unsafe { values. get_unchecked ( start_offset..) } ,
276
- 0 ,
277
- start_offset as u32 ,
278
- ) ;
279
- views. append ( view) ;
280
303
}
281
304
* row = & row[ offset..] ;
282
305
}
283
306
284
- if options. descending {
285
- values. as_slice_mut ( ) . iter_mut ( ) . for_each ( |o| * o = !* o) ;
286
- for view in views. as_slice_mut ( ) {
287
- let len = * view as u32 ;
288
- if len <= 12 {
289
- let mut bytes = view. to_le_bytes ( ) ;
290
- bytes
291
- . iter_mut ( )
292
- . skip ( 4 )
293
- . take ( len as usize )
294
- . for_each ( |o| * o = !* o) ;
295
- * view = u128:: from_le_bytes ( bytes) ;
296
- } else {
297
- let mut byte_view = ByteView :: from ( * view) ;
298
- let mut prefix = byte_view. prefix . to_le_bytes ( ) ;
299
- prefix. iter_mut ( ) . for_each ( |o| * o = !* o) ;
300
- byte_view. prefix = u32:: from_le_bytes ( prefix) ;
301
- * view = byte_view. into ( ) ;
302
- }
303
- }
304
- }
305
-
306
307
if check_utf8 {
307
- // the values contains all data, no matter if it is short or long
308
- // we can validate utf8 in one go.
309
- std:: str:: from_utf8 ( values. as_slice ( ) ) . unwrap ( ) ;
308
+ // We validate the utf8 of the output buffer and the inline buffer
309
+ // This is much faster than validating each string individually
310
+ std:: str:: from_utf8 ( output_buffer. as_slice ( ) ) . unwrap ( ) ;
311
+ std:: str:: from_utf8 ( inline_buffer. as_slice ( ) ) . unwrap ( ) ;
310
312
}
311
313
312
314
let builder = ArrayDataBuilder :: new ( DataType :: BinaryView )
313
315
. len ( len)
314
316
. null_count ( null_count)
315
317
. null_bit_buffer ( Some ( nulls. into ( ) ) )
316
318
. add_buffer ( views. finish ( ) )
317
- . add_buffer ( values . into ( ) ) ;
319
+ . add_buffer ( output_buffer . into ( ) ) ;
318
320
319
321
// SAFETY:
320
322
// Valid by construction above
0 commit comments