@@ -154,6 +154,7 @@ use crate::reader::map_array::MapArrayDecoder;
154
154
use crate :: reader:: null_array:: NullArrayDecoder ;
155
155
use crate :: reader:: primitive_array:: PrimitiveArrayDecoder ;
156
156
use crate :: reader:: string_array:: StringArrayDecoder ;
157
+ use crate :: reader:: string_view_array:: StringViewArrayDecoder ;
157
158
use crate :: reader:: struct_array:: StructArrayDecoder ;
158
159
use crate :: reader:: tape:: { Tape , TapeDecoder } ;
159
160
use crate :: reader:: timestamp_array:: TimestampArrayDecoder ;
@@ -167,6 +168,7 @@ mod primitive_array;
167
168
mod schema;
168
169
mod serializer;
169
170
mod string_array;
171
+ mod string_view_array;
170
172
mod struct_array;
171
173
mod tape;
172
174
mod timestamp_array;
@@ -732,6 +734,7 @@ fn make_decoder(
732
734
DataType :: Decimal256 ( p, s) => Ok ( Box :: new( DecimalArrayDecoder :: <Decimal256Type >:: new( p, s) ) ) ,
733
735
DataType :: Boolean => Ok ( Box :: <BooleanArrayDecoder >:: default ( ) ) ,
734
736
DataType :: Utf8 => Ok ( Box :: new( StringArrayDecoder :: <i32 >:: new( coerce_primitive) ) ) ,
737
+ DataType :: Utf8View => Ok ( Box :: new( StringViewArrayDecoder :: new( coerce_primitive) ) ) ,
735
738
DataType :: LargeUtf8 => Ok ( Box :: new( StringArrayDecoder :: <i64 >:: new( coerce_primitive) ) ) ,
736
739
DataType :: List ( _) => Ok ( Box :: new( ListArrayDecoder :: <i32 >:: new( data_type, coerce_primitive, strict_mode, is_nullable, struct_mode) ?) ) ,
737
740
DataType :: LargeList ( _) => Ok ( Box :: new( ListArrayDecoder :: <i64 >:: new( data_type, coerce_primitive, strict_mode, is_nullable, struct_mode) ?) ) ,
@@ -751,7 +754,7 @@ mod tests {
751
754
use std:: io:: { BufReader , Cursor , Seek } ;
752
755
753
756
use arrow_array:: cast:: AsArray ;
754
- use arrow_array:: { Array , BooleanArray , Float64Array , ListArray , StringArray } ;
757
+ use arrow_array:: { Array , BooleanArray , Float64Array , ListArray , StringArray , StringViewArray } ;
755
758
use arrow_buffer:: { ArrowNativeType , Buffer } ;
756
759
use arrow_cast:: display:: { ArrayFormatter , FormatOptions } ;
757
760
use arrow_data:: ArrayDataBuilder ;
@@ -902,6 +905,145 @@ mod tests {
902
905
assert_eq ! ( col2. value( 4 ) , "" ) ;
903
906
}
904
907
908
/// Checks that long values of a `Utf8View` column decoded from JSON end up in
/// the array's out-of-line data buffers and that every value round-trips.
#[test]
fn test_long_string_view_allocation() {
    // Field "a" mixes short and long strings. In the view layout a value of at
    // most 12 bytes is inlined in its view; only longer values occupy space in
    // the variadic data buffers:
    //   Row 1: "short" (5 bytes)                    -> inlined
    //   Row 2: "this is definitely long" (23 bytes) -> 23 bytes out of line
    //   Row 3: "hello" (5 bytes)                    -> inlined
    //   Row 4: "\nfoobar😀asfgÿ" (17 bytes)          -> 17 bytes out of line
    // Expected out-of-line bytes = 23 + 17 = 40.
    let expected_capacity: usize = 40;

    let buf = r#"
    {"a": "short", "b": "dummy"}
    {"a": "this is definitely long", "b": "dummy"}
    {"a": "hello", "b": "dummy"}
    {"a": "\nfoobar😀asfgÿ", "b": "dummy"}
    "#;

    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Utf8View, true),
        Field::new("b", DataType::LargeUtf8, true),
    ]));

    let batches = do_read(buf, 1024, false, false, schema);
    assert_eq!(batches.len(), 1, "Expected one record batch");

    // Get the first column ("a") as a StringViewArray.
    let col_a = batches[0].column(0);
    let string_view_array = col_a
        .as_any()
        .downcast_ref::<StringViewArray>()
        .expect("Column should be a StringViewArray");

    // Measure the variadic data buffers only. `to_data().buffers()[0]` is the
    // views buffer (16 bytes per row), which would satisfy the bound below no
    // matter how the string bytes were stored.
    let data_buffer: usize = string_view_array
        .data_buffers()
        .iter()
        .map(|buffer| buffer.len())
        .sum();

    // The buffers must hold at least the bytes of the two long values.
    assert!(
        data_buffer >= expected_capacity,
        "Data buffer length ({}) should be at least {}",
        data_buffer,
        expected_capacity
    );

    // Additionally, verify that the decoded values are correct.
    assert_eq!(string_view_array.value(0), "short");
    assert_eq!(string_view_array.value(1), "this is definitely long");
    assert_eq!(string_view_array.value(2), "hello");
    assert_eq!(string_view_array.value(3), "\nfoobar😀asfgÿ");
}
962
+
963
/// Test that JSON numbers read into a `Utf8View` column are converted to
/// strings and that the long ones are stored in the out-of-line data buffers.
#[test]
fn test_numeric_view_allocation() {
    // Each number is decoded to its textual form. In the view layout only
    // values longer than 12 bytes occupy space in the variadic data buffers:
    //   Row 1: 123456789         -> "123456789" (9 bytes)          -> inlined
    //   Row 2: 1000000000000     -> "1000000000000" (13 bytes)     -> 13 bytes out of line
    //   Row 3: 3.1415            -> "3.1415" (6 bytes)             -> inlined
    //   Row 4: 2.718281828459045 -> "2.718281828459045" (17 bytes) -> 17 bytes out of line
    // Expected out-of-line bytes = 13 + 17 = 30.
    // NOTE(review): any larger figure the decoder pre-reserves is a capacity
    // estimate and cannot be observed through `Buffer::len`.
    let expected_capacity: usize = 30;

    let buf = r#"
    {"n": 123456789}
    {"n": 1000000000000}
    {"n": 3.1415}
    {"n": 2.718281828459045}
    "#;

    let schema = Arc::new(Schema::new(vec![Field::new("n", DataType::Utf8View, true)]));

    // NOTE(review): the third argument is presumably `coerce_primitive`, which
    // lets numbers be read into a string column; confirm against `do_read`.
    let batches = do_read(buf, 1024, true, false, schema);
    assert_eq!(batches.len(), 1, "Expected one record batch");

    let col_n = batches[0].column(0);
    let string_view_array = col_n
        .as_any()
        .downcast_ref::<StringViewArray>()
        .expect("Column should be a StringViewArray");

    // Measure the variadic data buffers only. `to_data().buffers()[0]` is the
    // views buffer (16 bytes per row), which would satisfy the bound below no
    // matter how the string bytes were stored.
    let data_buffer: usize = string_view_array
        .data_buffers()
        .iter()
        .map(|buffer| buffer.len())
        .sum();
    assert!(
        data_buffer >= expected_capacity,
        "Data buffer length ({}) should be at least {}",
        data_buffer,
        expected_capacity
    );

    // Verify that the converted string values are correct.
    // Note: The format of the number converted to a string should match the actual implementation.
    assert_eq!(string_view_array.value(0), "123456789");
    assert_eq!(string_view_array.value(1), "1000000000000");
    assert_eq!(string_view_array.value(2), "3.1415");
    assert_eq!(string_view_array.value(3), "2.718281828459045");
}
1009
+
1010
/// Reads the same newline-delimited JSON into a `Utf8View` column ("a") and a
/// `LargeUtf8` column ("b"), covering escapes, `\u` surrogate pairs, missing
/// keys, explicit nulls, empty strings and blank input lines.
#[test]
fn test_string_with_uft8view() {
    let buf = r#"
    {"a": "1", "b": "2"}
    {"a": "hello", "b": "shoo"}
    {"b": "\t😁foo", "a": "\nfoobar\ud83d\ude00\u0061\u0073\u0066\u0067\u00FF"}

    {"b": null}
    {"b": "", "a": null}

    "#;
    let schema = Arc::new(Schema::new(vec![
        Field::new("a", DataType::Utf8View, true),
        Field::new("b", DataType::LargeUtf8, true),
    ]));

    let batches = do_read(buf, 1024, false, false, schema);
    assert_eq!(batches.len(), 1);

    // Column "a": absent in row 3 and explicitly null in row 4.
    let col1 = batches[0].column(0).as_string_view();
    assert_eq!(col1.null_count(), 2);
    assert_eq!(col1.value(0), "1");
    assert_eq!(col1.value(1), "hello");
    // The JSON escapes decode to a newline followed directly by "foobar😀asfgÿ".
    assert_eq!(col1.value(2), "\nfoobar😀asfgÿ");
    assert!(col1.is_null(3));
    assert!(col1.is_null(4));
    assert_eq!(col1.data_type(), &DataType::Utf8View);

    // Column "b": null only in row 3; row 4 is an empty, non-null string.
    let col2 = batches[0].column(1).as_string::<i64>();
    assert_eq!(col2.null_count(), 1);
    assert_eq!(col2.value(0), "2");
    assert_eq!(col2.value(1), "shoo");
    assert_eq!(col2.value(2), "\t😁foo");
    assert!(col2.is_null(3));
    assert_eq!(col2.value(4), "");
}
1046
+
905
1047
#[ test]
906
1048
fn test_complex ( ) {
907
1049
let buf = r#"
0 commit comments