@@ -18,7 +18,8 @@ use crate::{
1818
1919use crate :: arrow:: array:: {
2020 cast:: AsArray , make_array, new_null_array, Array as ArrowArray , BooleanArray , GenericListArray ,
21- MapArray , OffsetSizeTrait , PrimitiveArray , RecordBatch , StringArray , StructArray ,
21+ GenericStringArray , LargeStringArray , MapArray , OffsetSizeTrait , PrimitiveArray , RecordBatch ,
22+ StringArray , StructArray ,
2223} ;
2324use crate :: arrow:: buffer:: NullBuffer ;
2425use crate :: arrow:: compute:: concat_batches;
@@ -1012,24 +1013,35 @@ pub(crate) fn parse_json(
10121013 schema : SchemaRef ,
10131014) -> DeltaResult < Box < dyn EngineData > > {
10141015 let json_strings: RecordBatch = ArrowEngineData :: try_from_engine_data ( json_strings) ?. into ( ) ;
1015- let json_strings = json_strings
1016- . column ( 0 )
1017- . as_any ( )
1018- . downcast_ref :: < StringArray > ( )
1019- . ok_or_else ( || {
1020- Error :: generic ( "Expected json_strings to be a StringArray, found something else" )
1021- } ) ?;
1016+ let array_ref = json_strings. column ( 0 ) ;
10221017 let schema = Arc :: new ( ArrowSchema :: try_from_kernel ( schema. as_ref ( ) ) ?) ;
1023- let result = parse_json_impl ( json_strings, schema) ?;
1024- Ok ( Box :: new ( ArrowEngineData :: new ( result) ) )
1018+
1019+ // Try LargeStringArray first
1020+ if let Some ( large_strings) = array_ref. as_any ( ) . downcast_ref :: < LargeStringArray > ( ) {
1021+ let result = parse_json_impl ( large_strings, schema) ?;
1022+ return Ok ( Box :: new ( ArrowEngineData :: new ( result) ) ) ;
1023+ }
1024+
1025+ // Fall back to StringArray
1026+ if let Some ( strings) = array_ref. as_any ( ) . downcast_ref :: < StringArray > ( ) {
1027+ let result = parse_json_impl ( strings, schema) ?;
1028+ return Ok ( Box :: new ( ArrowEngineData :: new ( result) ) ) ;
1029+ }
1030+
1031+ Err ( Error :: generic (
1032+ "Expected json_strings to be a StringArray or LargeStringArray, found something else" ,
1033+ ) )
10251034}
10261035
10271036// Raw arrow implementation of the json parsing. Separate from the public function for testing.
10281037//
10291038// NOTE: This code is really inefficient because arrow lacks the native capability to perform robust
10301039// StringArray -> StructArray JSON parsing. See https://github.com/apache/arrow-rs/issues/6522. If
10311040// that shortcoming gets fixed upstream, this method can simplify or hopefully even disappear.
1032- fn parse_json_impl ( json_strings : & StringArray , schema : ArrowSchemaRef ) -> DeltaResult < RecordBatch > {
1041+ fn parse_json_impl < O : OffsetSizeTrait > (
1042+ json_strings : & GenericStringArray < O > ,
1043+ schema : ArrowSchemaRef ,
1044+ ) -> DeltaResult < RecordBatch > {
10331045 if json_strings. is_empty ( ) {
10341046 return Ok ( RecordBatch :: new_empty ( schema) ) ;
10351047 }
@@ -1231,45 +1243,45 @@ mod tests {
12311243 ArrowField :: new( "c" , ArrowDataType :: Int32 , true ) ,
12321244 ] ) ) ;
12331245 let input: Vec < & str > = vec ! [ ] ;
1234- let result = parse_json_impl ( & input. into ( ) , requested_schema. clone ( ) ) . unwrap ( ) ;
1246+ let result = parse_json_impl :: < i32 > ( & input. into ( ) , requested_schema. clone ( ) ) . unwrap ( ) ;
12351247 assert_eq ! ( result. num_rows( ) , 0 ) ;
12361248
12371249 let input: Vec < Option < & str > > = vec ! [ Some ( "" ) ] ;
1238- let result = parse_json_impl ( & input. into ( ) , requested_schema. clone ( ) ) ;
1250+ let result = parse_json_impl :: < i32 > ( & input. into ( ) , requested_schema. clone ( ) ) ;
12391251 result. expect_err ( "empty string" ) ;
12401252
12411253 let input: Vec < Option < & str > > = vec ! [ Some ( " \n \t " ) ] ;
1242- let result = parse_json_impl ( & input. into ( ) , requested_schema. clone ( ) ) ;
1254+ let result = parse_json_impl :: < i32 > ( & input. into ( ) , requested_schema. clone ( ) ) ;
12431255 result. expect_err ( "empty string" ) ;
12441256
12451257 let input: Vec < Option < & str > > = vec ! [ Some ( r#""a""# ) ] ;
1246- let result = parse_json_impl ( & input. into ( ) , requested_schema. clone ( ) ) ;
1258+ let result = parse_json_impl :: < i32 > ( & input. into ( ) , requested_schema. clone ( ) ) ;
12471259 result. expect_err ( "invalid string" ) ;
12481260
12491261 let input: Vec < Option < & str > > = vec ! [ Some ( r#"{ "a": 1"# ) ] ;
1250- let result = parse_json_impl ( & input. into ( ) , requested_schema. clone ( ) ) ;
1262+ let result = parse_json_impl :: < i32 > ( & input. into ( ) , requested_schema. clone ( ) ) ;
12511263 result. expect_err ( "incomplete object" ) ;
12521264
12531265 let input: Vec < Option < & str > > = vec ! [ Some ( "{}{}" ) ] ;
1254- let result = parse_json_impl ( & input. into ( ) , requested_schema. clone ( ) ) ;
1266+ let result = parse_json_impl :: < i32 > ( & input. into ( ) , requested_schema. clone ( ) ) ;
12551267 assert ! ( matches!(
12561268 result. unwrap_err( ) ,
12571269 Error :: Generic ( s) if s == "Malformed JSON: Multiple JSON objects"
12581270 ) ) ;
12591271
12601272 let input: Vec < Option < & str > > = vec ! [ Some ( r#"{} { "a": 1"# ) ] ;
1261- let result = parse_json_impl ( & input. into ( ) , requested_schema. clone ( ) ) ;
1273+ let result = parse_json_impl :: < i32 > ( & input. into ( ) , requested_schema. clone ( ) ) ;
12621274 assert ! ( matches!(
12631275 result. unwrap_err( ) ,
12641276 Error :: Generic ( s) if s == "Malformed JSON: Multiple JSON objects"
12651277 ) ) ;
12661278
12671279 let input: Vec < Option < & str > > = vec ! [ Some ( r#"{ "a": 1"# ) , Some ( r#", "b"}"# ) ] ;
1268- let result = parse_json_impl ( & input. into ( ) , requested_schema. clone ( ) ) ;
1280+ let result = parse_json_impl :: < i32 > ( & input. into ( ) , requested_schema. clone ( ) ) ;
12691281 result. expect_err ( "split object" ) ;
12701282
12711283 let input: Vec < Option < & str > > = vec ! [ None , Some ( r#"{"a": 1, "b": "2", "c": 3}"# ) , None ] ;
1272- let result = parse_json_impl ( & input. into ( ) , requested_schema. clone ( ) ) . unwrap ( ) ;
1284+ let result = parse_json_impl :: < i32 > ( & input. into ( ) , requested_schema. clone ( ) ) . unwrap ( ) ;
12731285 assert_eq ! ( result. num_rows( ) , 3 ) ;
12741286 assert_eq ! ( result. column( 0 ) . null_count( ) , 2 ) ;
12751287 assert_eq ! ( result. column( 1 ) . null_count( ) , 2 ) ;
@@ -1288,7 +1300,7 @@ mod tests {
12881300 let json_string = format ! ( r#"{{"long_val": "{long_string}"}}"# ) ;
12891301 let input: Vec < Option < & str > > = vec ! [ Some ( & json_string) ] ;
12901302
1291- let batch = parse_json_impl ( & input. into ( ) , schema. clone ( ) ) . unwrap ( ) ;
1303+ let batch = parse_json_impl :: < i32 > ( & input. into ( ) , schema. clone ( ) ) . unwrap ( ) ;
12921304 assert_eq ! ( batch. num_rows( ) , 1 ) ;
12931305 let long_col = batch. column ( 0 ) . as_string :: < i32 > ( ) ;
12941306 assert_eq ! ( long_col. value( 0 ) , long_string) ;
0 commit comments