diff --git a/acceptance/src/data.rs b/acceptance/src/data.rs
index 95e4892282..0074ef1791 100644
--- a/acceptance/src/data.rs
+++ b/acceptance/src/data.rs
@@ -85,15 +85,22 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
 // some things are equivalent, but don't show up as equivalent for `==`, so we normalize here
 fn normalize_col(col: Arc<dyn Array>) -> Arc<dyn Array> {
-    if let DataType::Timestamp(unit, Some(zone)) = col.data_type() {
-        if **zone == *"+00:00" {
-            let data_type = DataType::Timestamp(*unit, Some("UTC".into()));
-            delta_kernel::arrow::compute::cast(&col, &data_type).expect("Could not cast to UTC")
-        } else {
-            col
+    match col.data_type() {
+        DataType::Timestamp(unit, Some(zone)) => {
+            if **zone == *"+00:00" {
+                let data_type = DataType::Timestamp(*unit, Some("UTC".into()));
+                delta_kernel::arrow::compute::cast(&col, &data_type).expect("Could not cast to UTC")
+            } else {
+                col
+            }
+        }
+        DataType::Utf8 => {
+            // just make everything LargeUtf8
+            let data_type = DataType::LargeUtf8;
+            delta_kernel::arrow::compute::cast(&col, &data_type)
+                .expect("Could not cast to large utf8")
         }
-    } else {
-        col
+        _ => col,
     }
 }
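The `normalize_col` change above is the crux of the acceptance-test update: golden files may still carry Utf8 columns while the kernel now emits LargeUtf8, so string columns are normalized to the wide form before comparison. A minimal standalone sketch of the same cast, assuming the plain `arrow` crate rather than the `delta_kernel::arrow` re-export used in the patch:

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, StringArray};
use arrow::compute::cast;
use arrow::datatypes::DataType;

fn main() {
    let col: ArrayRef = Arc::new(StringArray::from(vec!["a", "b"]));
    // Widen Utf8 (i32 offsets) to LargeUtf8 (i64 offsets) so `==` comparisons
    // no longer fail on offset width alone; other types pass through untouched.
    let normalized: ArrayRef = match col.data_type() {
        DataType::Utf8 => cast(&col, &DataType::LargeUtf8).expect("Could not cast to large utf8"),
        _ => col,
    };
    assert_eq!(normalized.data_type(), &DataType::LargeUtf8);
}
```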
diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs
index ae7669912b..1531c1b876 100644
--- a/kernel/src/actions/mod.rs
+++ b/kernel/src/actions/mod.rs
@@ -1003,8 +1003,8 @@ mod tests {
     use super::*;
     use crate::{
         arrow::array::{
-            Array, BooleanArray, Int32Array, Int64Array, ListArray, ListBuilder, MapBuilder,
-            MapFieldNames, RecordBatch, StringArray, StringBuilder, StructArray,
+            Array, BooleanArray, Int32Array, Int64Array, LargeStringArray, LargeStringBuilder,
+            ListArray, ListBuilder, MapBuilder, MapFieldNames, RecordBatch, StructArray,
         },
         arrow::datatypes::{DataType as ArrowDataType, Field, Schema},
         arrow::json::ReaderBuilder,
@@ -1044,19 +1044,19 @@ mod tests {
 
     fn create_string_map_builder(
         nullable_values: bool,
-    ) -> MapBuilder<StringBuilder, StringBuilder> {
+    ) -> MapBuilder<LargeStringBuilder, LargeStringBuilder> {
         MapBuilder::new(
             Some(MapFieldNames {
                 entry: "key_value".to_string(),
                 key: "key".to_string(),
                 value: "value".to_string(),
             }),
-            StringBuilder::new(),
-            StringBuilder::new(),
+            LargeStringBuilder::new(),
+            LargeStringBuilder::new(),
         )
         .with_values_field(Field::new(
             "value".to_string(),
-            ArrowDataType::Utf8,
+            ArrowDataType::LargeUtf8,
             nullable_values,
         ))
     }
@@ -1527,7 +1527,7 @@ mod tests {
             .into();
 
         let schema = Arc::new(Schema::new(vec![
-            Field::new("appId", ArrowDataType::Utf8, false),
+            Field::new("appId", ArrowDataType::LargeUtf8, false),
             Field::new("version", ArrowDataType::Int64, false),
             Field::new("lastUpdated", ArrowDataType::Int64, true),
         ]));
@@ -1535,7 +1535,7 @@
         let expected = RecordBatch::try_new(
             schema,
             vec![
-                Arc::new(StringArray::from(vec!["app_id"])),
+                Arc::new(LargeStringArray::from(vec!["app_id"])),
                 Arc::new(Int64Array::from(vec![0_i64])),
                 Arc::new(Int64Array::from(vec![None::<i64>])),
             ],
         )
         .unwrap();
@@ -1570,11 +1570,13 @@ mod tests {
             vec![
                 Arc::new(Int64Array::from(vec![Some(0)])),
                 Arc::new(Int64Array::from(vec![None::<i64>])),
-                Arc::new(StringArray::from(vec![Some("UNKNOWN")])),
+                Arc::new(LargeStringArray::from(vec![Some("UNKNOWN")])),
                 operation_parameters,
-                Arc::new(StringArray::from(vec![Some(format!("v{KERNEL_VERSION}"))])),
-                Arc::new(StringArray::from(vec![None::<String>])),
-                Arc::new(StringArray::from(vec![commit_info_txn_id])),
+                Arc::new(LargeStringArray::from(vec![Some(format!(
+                    "v{KERNEL_VERSION}"
+                ))])),
+                Arc::new(LargeStringArray::from(vec![None::<String>])),
+                Arc::new(LargeStringArray::from(vec![commit_info_txn_id])),
             ],
         )
         .unwrap();
@@ -1605,8 +1607,8 @@ mod tests {
         let expected = RecordBatch::try_new(
             record_batch.schema(),
             vec![
-                Arc::new(StringArray::from(vec!["my.domain"])),
-                Arc::new(StringArray::from(vec!["config_value"])),
+                Arc::new(LargeStringArray::from(vec!["my.domain"])),
+                Arc::new(LargeStringArray::from(vec!["config_value"])),
                 Arc::new(BooleanArray::from(vec![false])),
             ],
         )
@@ -1859,7 +1861,7 @@ mod tests {
         .unwrap()
         .into();
 
-        let list_field = Arc::new(Field::new("element", ArrowDataType::Utf8, false));
+        let list_field = Arc::new(Field::new("element", ArrowDataType::LargeUtf8, false));
         let protocol_fields = vec![
             Field::new("minReaderVersion", ArrowDataType::Int32, false),
             Field::new("minWriterVersion", ArrowDataType::Int32, false),
@@ -1876,13 +1878,13 @@ mod tests {
         ];
         let schema = Arc::new(Schema::new(protocol_fields.clone()));
 
-        let string_builder = StringBuilder::new();
+        let string_builder = LargeStringBuilder::new();
         let mut list_builder = ListBuilder::new(string_builder).with_field(list_field.clone());
         list_builder.values().append_value("columnMapping");
         list_builder.append(true);
         let reader_features_array = list_builder.finish();
 
-        let string_builder = StringBuilder::new();
+        let string_builder = LargeStringBuilder::new();
         let mut list_builder = ListBuilder::new(string_builder).with_field(list_field.clone());
         list_builder.values().append_value("deletionVectors");
         list_builder.append(true);
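All of these test rewrites follow from one fact: a `MapBuilder` parameterized with `LargeStringBuilder` produces LargeUtf8 key and value children, matching what the kernel now emits. A small self-contained sketch, using the plain `arrow` crate and made-up key/value data:

```rust
use arrow::array::{Array, LargeStringBuilder, MapBuilder, MapFieldNames};
use arrow::datatypes::DataType;

fn main() {
    let names = MapFieldNames {
        entry: "key_value".to_string(),
        key: "key".to_string(),
        value: "value".to_string(),
    };
    let mut builder = MapBuilder::new(
        Some(names),
        LargeStringBuilder::new(),
        LargeStringBuilder::new(),
    );
    builder.keys().append_value("appId");
    builder.values().append_value("app_1");
    builder.append(true).unwrap();

    let map = builder.finish();
    // Both children are now LargeUtf8 (i64 offsets) instead of Utf8.
    assert_eq!(map.keys().data_type(), &DataType::LargeUtf8);
    assert_eq!(map.values().data_type(), &DataType::LargeUtf8);
}
```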
diff --git a/kernel/src/engine/arrow_conversion.rs b/kernel/src/engine/arrow_conversion.rs
index 42049d26aa..778aae51d1 100644
--- a/kernel/src/engine/arrow_conversion.rs
+++ b/kernel/src/engine/arrow_conversion.rs
@@ -122,7 +122,7 @@ impl TryFromKernel<&DataType> for ArrowDataType {
         match t {
             DataType::Primitive(p) => {
                 match p {
-                    PrimitiveType::String => Ok(ArrowDataType::Utf8),
+                    PrimitiveType::String => Ok(ArrowDataType::LargeUtf8),
                     PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type
                     PrimitiveType::Integer => Ok(ArrowDataType::Int32),
                     PrimitiveType::Short => Ok(ArrowDataType::Int16),
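This one-line mapping is the root of the whole patch: every kernel `STRING` surfaces as `LargeUtf8` from now on. A sketch of how a caller would observe the new mapping; the `delta_kernel::engine::arrow_conversion::TryFromKernel` import path is an assumption based on the file being patched and may differ in the published API:

```rust
// Assumed import paths; the trait lives in the file patched above.
use delta_kernel::arrow::datatypes::DataType as ArrowDataType;
use delta_kernel::engine::arrow_conversion::TryFromKernel;
use delta_kernel::schema::DataType;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let arrow_type = ArrowDataType::try_from_kernel(&DataType::STRING)?;
    // Kernel strings now convert to LargeUtf8 rather than Utf8.
    assert_eq!(arrow_type, ArrowDataType::LargeUtf8);
    Ok(())
}
```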
diff --git a/kernel/src/engine/arrow_data.rs b/kernel/src/engine/arrow_data.rs
index 1ce2154f82..baae8be17c 100644
--- a/kernel/src/engine/arrow_data.rs
+++ b/kernel/src/engine/arrow_data.rs
@@ -90,6 +90,11 @@ impl From<Box<ArrowEngineData>> for RecordBatch {
     }
 }
 
+/// Helper function to get a string value from an array using the specified offset size
+fn get_string_value<OffsetSize: OffsetSizeTrait>(arry: &dyn Array, index: usize) -> String {
+    arry.as_string::<OffsetSize>().value(index).to_string()
+}
+
 impl<OffsetSize> EngineList for GenericListArray<OffsetSize>
 where
     OffsetSize: OffsetSizeTrait,
@@ -100,8 +105,11 @@
     fn get(&self, row_index: usize, index: usize) -> String {
         let arry = self.value(row_index);
-        let sarry = arry.as_string::<i32>();
-        sarry.value(index).to_string()
+        match arry.data_type() {
+            ArrowDataType::LargeUtf8 => get_string_value::<i64>(arry.as_ref(), index),
+            ArrowDataType::Utf8 => get_string_value::<i32>(arry.as_ref(), index),
+            _ => String::new(),
+        }
     }
 
     fn materialize(&self, row_index: usize) -> Vec<String> {
@@ -113,35 +121,70 @@
     }
 }
 
+/// Helper function to get a map value by key using the specified offset size
+fn get_map_value<'a, OffsetSize: OffsetSizeTrait>(
+    keys: &'a dyn Array,
+    vals: &'a dyn Array,
+    start_offset: usize,
+    count: usize,
+    key: &str,
+) -> Option<&'a str> {
+    let keys = keys.as_string::<OffsetSize>();
+    let vals = vals.as_string::<OffsetSize>();
+    for (idx, map_key) in keys.iter().enumerate().skip(start_offset).take(count) {
+        if let Some(map_key) = map_key {
+            if key == map_key {
+                return Some(vals.value(idx));
+            }
+        }
+    }
+    None
+}
+
+/// Helper function to materialize a map using the specified offset size
+fn materialize_map<OffsetSize: OffsetSizeTrait>(
+    keys: &dyn Array,
+    values: &dyn Array,
+) -> HashMap<String, String> {
+    let mut ret = HashMap::new();
+    let keys = keys.as_string::<OffsetSize>();
+    let values = values.as_string::<OffsetSize>();
+    for (key, value) in keys.iter().zip(values.iter()) {
+        if let (Some(key), Some(value)) = (key, value) {
+            ret.insert(key.into(), value.into());
+        }
+    }
+    ret
+}
+
 impl EngineMap for MapArray {
     fn get<'a>(&'a self, row_index: usize, key: &str) -> Option<&'a str> {
         let offsets = self.offsets();
         let start_offset = offsets[row_index] as usize;
         let count = offsets[row_index + 1] as usize - start_offset;
-        let keys = self.keys().as_string::<i32>();
-        for (idx, map_key) in keys.iter().enumerate().skip(start_offset).take(count) {
-            if let Some(map_key) = map_key {
-                if key == map_key {
-                    // found the item
-                    let vals = self.values().as_string::<i32>();
-                    return Some(vals.value(idx));
-                }
+
+        match (self.keys().data_type(), self.values().data_type()) {
+            (ArrowDataType::LargeUtf8, ArrowDataType::LargeUtf8) => {
+                get_map_value::<i64>(self.keys(), self.values(), start_offset, count, key)
+            }
+            (ArrowDataType::Utf8, ArrowDataType::Utf8) => {
+                get_map_value::<i32>(self.keys(), self.values(), start_offset, count, key)
             }
+            _ => None,
         }
-        None
     }
 
     fn materialize(&self, row_index: usize) -> HashMap<String, String> {
-        let mut ret = HashMap::new();
         let map_val = self.value(row_index);
-        let keys = map_val.column(0).as_string::<i32>();
-        let values = map_val.column(1).as_string::<i32>();
-        for (key, value) in keys.iter().zip(values.iter()) {
-            if let (Some(key), Some(value)) = (key, value) {
-                ret.insert(key.into(), value.into());
+        match (map_val.column(0).data_type(), map_val.column(1).data_type()) {
+            (ArrowDataType::LargeUtf8, ArrowDataType::LargeUtf8) => {
+                materialize_map::<i64>(map_val.column(0), map_val.column(1))
             }
+            (ArrowDataType::Utf8, ArrowDataType::Utf8) => {
+                materialize_map::<i32>(map_val.column(0), map_val.column(1))
+            }
+            _ => HashMap::new(),
         }
-        ret
     }
 }
@@ -277,19 +320,24 @@ impl ArrowEngineData {
         data_type: &DataType,
         col: &'a dyn Array,
     ) -> DeltaResult<&'a dyn GetData<'a>> {
-        use ArrowDataType::Utf8;
+        use ArrowDataType::{LargeUtf8, Utf8, Utf8View};
+
+        // Helper to check if a type is a string type (Utf8, LargeUtf8, or Utf8View)
+        let is_string_type = |dt: &ArrowDataType| matches!(dt, Utf8 | LargeUtf8 | Utf8View);
+
         let col_as_list = || {
             if let Some(array) = col.as_list_opt::<i32>() {
-                (array.value_type() == Utf8).then_some(array as _)
+                is_string_type(&array.value_type()).then_some(array as _)
             } else if let Some(array) = col.as_list_opt::<i64>() {
-                (array.value_type() == Utf8).then_some(array as _)
+                is_string_type(&array.value_type()).then_some(array as _)
             } else {
                 None
             }
         };
         let col_as_map = || {
             col.as_map_opt().and_then(|array| {
-                (array.key_type() == &Utf8 && array.value_type() == &Utf8).then_some(array as _)
+                (is_string_type(array.key_type()) && is_string_type(array.value_type()))
+                    .then_some(array as _)
             })
         };
         let result: Result<&'a dyn GetData<'a>, _> = match data_type {
@@ -299,7 +347,12 @@ impl ArrowEngineData {
             }
             &DataType::STRING => {
                 debug!("Pushing string array for {}", ColumnName::new(path));
-                col.as_string_opt().map(|a| a as _).ok_or("string")
+                match col.data_type() {
+                    ArrowDataType::LargeUtf8 => col.as_string_opt::<i64>().map(|a| a as _),
+                    ArrowDataType::Utf8 => col.as_string_opt::<i32>().map(|a| a as _),
+                    _ => None,
+                }
+                .ok_or("string")
             }
             &DataType::INTEGER => {
                 debug!("Pushing int32 array for {}", ColumnName::new(path));
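Every helper added above follows the same shape: inspect the runtime `DataType`, then pick the offset width for `as_string`. A compilable sketch of that dispatch against the plain `arrow` crate; the function names mirror the patch but the surrounding program is invented:

```rust
use arrow::array::{Array, AsArray, LargeStringArray, OffsetSizeTrait, StringArray};
use arrow::datatypes::DataType;

/// Read one value out of a string array with the offset width given by `O`.
fn get_string_value<O: OffsetSizeTrait>(arr: &dyn Array, index: usize) -> String {
    arr.as_string::<O>().value(index).to_string()
}

/// Dispatch on the runtime type: i64 offsets for LargeUtf8, i32 for Utf8.
fn string_at(arr: &dyn Array, index: usize) -> String {
    match arr.data_type() {
        DataType::LargeUtf8 => get_string_value::<i64>(arr, index),
        DataType::Utf8 => get_string_value::<i32>(arr, index),
        _ => String::new(),
    }
}

fn main() {
    let small = StringArray::from(vec!["x"]);
    let large = LargeStringArray::from(vec!["y"]);
    assert_eq!(string_at(&small, 0), "x");
    assert_eq!(string_at(&large, 0), "y");
}
```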
diff --git a/kernel/src/engine/arrow_expression/evaluate_expression.rs b/kernel/src/engine/arrow_expression/evaluate_expression.rs
index bda9ea06f4..c507829937 100644
--- a/kernel/src/engine/arrow_expression/evaluate_expression.rs
+++ b/kernel/src/engine/arrow_expression/evaluate_expression.rs
@@ -1041,7 +1041,7 @@ mod tests {
         let result2 = coalesce_arrays(&[arr1, arr2], Some(&DataType::STRING));
         assert_result_error_with_message(
             result2,
-            "Requested result type Utf8 does not match arrays' data type Int32",
+            "Requested result type LargeUtf8 does not match arrays' data type Int32",
         );
     }
diff --git a/kernel/src/engine/arrow_expression/mod.rs b/kernel/src/engine/arrow_expression/mod.rs
index 556347dd9c..07a1735216 100644
--- a/kernel/src/engine/arrow_expression/mod.rs
+++ b/kernel/src/engine/arrow_expression/mod.rs
@@ -29,6 +29,41 @@ mod tests;
 
 // TODO leverage scalars / Datum
 
+/// Helper function to append string values or nulls to either StringBuilder or LargeStringBuilder.
+/// This works generically with both Utf8 (i32 offsets) and LargeUtf8 (i64 offsets) builders.
+fn append_string_to_builder(
+    builder: &mut dyn ArrayBuilder,
+    value: Option<&str>,
+    num_rows: usize,
+) -> DeltaResult<()> {
+    // Try StringBuilder (Utf8 with i32 offsets)
+    if let Some(sb) = builder.as_any_mut().downcast_mut::<array::StringBuilder>() {
+        for _ in 0..num_rows {
+            match value {
+                Some(v) => sb.append_value(v),
+                None => sb.append_null(),
+            }
+        }
+        return Ok(());
+    }
+
+    // Try LargeStringBuilder (LargeUtf8 with i64 offsets)
+    if let Some(lsb) = builder
+        .as_any_mut()
+        .downcast_mut::<array::LargeStringBuilder>()
+    {
+        for _ in 0..num_rows {
+            match value {
+                Some(v) => lsb.append_value(v),
+                None => lsb.append_null(),
+            }
+        }
+        return Ok(());
+    }
+
+    Err(Error::invalid_expression("Invalid builder type for string"))
+}
+
 impl Scalar {
     /// Convert scalar to arrow array.
     pub fn to_array(&self, num_rows: usize) -> DeltaResult<ArrayRef> {
@@ -86,7 +121,7 @@ impl Scalar {
             Byte(val) => append_val_as!(array::Int8Builder, *val),
             Float(val) => append_val_as!(array::Float32Builder, *val),
             Double(val) => append_val_as!(array::Float64Builder, *val),
-            String(val) => append_val_as!(array::StringBuilder, val),
+            String(val) => append_string_to_builder(builder, Some(val), num_rows)?,
             Boolean(val) => append_val_as!(array::BooleanBuilder, *val),
             Timestamp(val) | TimestampNtz(val) => {
                 // timezone was already set at builder construction time
@@ -167,7 +202,7 @@ impl Scalar {
             DataType::BYTE => append_null_as!(array::Int8Builder),
             DataType::FLOAT => append_null_as!(array::Float32Builder),
             DataType::DOUBLE => append_null_as!(array::Float64Builder),
-            DataType::STRING => append_null_as!(array::StringBuilder),
+            DataType::STRING => append_string_to_builder(builder, None, num_rows)?,
             DataType::BOOLEAN => append_null_as!(array::BooleanBuilder),
             DataType::TIMESTAMP | DataType::TIMESTAMP_NTZ => {
                 append_null_as!(array::TimestampMicrosecondBuilder)
@@ -310,8 +345,11 @@ impl ExpressionEvaluator for DefaultExpressionEvaluator {
             (expr, output_type) => {
                 let array_ref = evaluate_expression(expr, batch, Some(output_type))?;
                 let array_ref = apply_schema_to(&array_ref, output_type)?;
-                let arrow_type = ArrowDataType::try_from_kernel(output_type)?;
-                let schema = ArrowSchema::new(vec![ArrowField::new("output", arrow_type, true)]);
+                // Use the actual data type of the array, not the converted kernel type
+                // This allows both Utf8 and LargeUtf8 to work correctly
+                let actual_arrow_type = array_ref.data_type().clone();
+                let schema =
+                    ArrowSchema::new(vec![ArrowField::new("output", actual_arrow_type, true)]);
                 RecordBatch::try_new(Arc::new(schema), vec![array_ref])?
             }
         };
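The helper added above just tries each concrete builder in turn, since `dyn ArrayBuilder` erases the offset width. A standalone sketch of the same downcast dance with the plain `arrow` crate; it uses `append_option` where the patch spells out a `match`, and returns `bool` instead of the kernel's `DeltaResult`:

```rust
use arrow::array::{Array, ArrayBuilder, LargeStringBuilder, StringBuilder};

fn append_string(builder: &mut dyn ArrayBuilder, value: Option<&str>, num_rows: usize) -> bool {
    // Try Utf8 (i32 offsets) first ...
    if let Some(sb) = builder.as_any_mut().downcast_mut::<StringBuilder>() {
        (0..num_rows).for_each(|_| sb.append_option(value));
        return true;
    }
    // ... then LargeUtf8 (i64 offsets).
    if let Some(lsb) = builder.as_any_mut().downcast_mut::<LargeStringBuilder>() {
        (0..num_rows).for_each(|_| lsb.append_option(value));
        return true;
    }
    false // not a string builder at all
}

fn main() {
    let mut large = LargeStringBuilder::new();
    assert!(append_string(&mut large, Some("hi"), 3));
    assert_eq!(large.finish().len(), 3);
}
```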
diff --git a/kernel/src/engine/arrow_expression/tests.rs b/kernel/src/engine/arrow_expression/tests.rs
index e2a5aea24c..069d3453af 100644
--- a/kernel/src/engine/arrow_expression/tests.rs
+++ b/kernel/src/engine/arrow_expression/tests.rs
@@ -2,7 +2,7 @@ use std::ops::{Add, Div, Mul, Sub};
 
 use crate::arrow::array::{
     create_array, Array, ArrayRef, BooleanArray, GenericStringArray, Int32Array, Int32Builder,
-    ListArray, MapArray, MapBuilder, MapFieldNames, StringArray, StringBuilder, StructArray,
+    LargeStringBuilder, ListArray, MapArray, MapBuilder, MapFieldNames, StringArray, StructArray,
 };
 use crate::arrow::buffer::{BooleanBuffer, NullBuffer, OffsetBuffer, ScalarBuffer};
 use crate::arrow::compute::kernels::cmp::{gt_eq, lt};
@@ -231,7 +231,7 @@ fn test_literal_complex_type_array() {
     let expected_values = [Some(1), Some(2), None, Some(3)];
     let expected_keys = (0..10).flat_map(|_| expected_keys.iter().cloned());
     let expected_values = (0..10).flat_map(|_| expected_values.iter().cloned());
-    let map_keys = map_array.keys().as_string::<i32>();
+    let map_keys = map_array.keys().as_string::<i64>();
     assert!(expected_keys.zip(map_keys).all(|(a, b)| a == b.unwrap()));
     let map_values = map_array
         .values()
@@ -266,11 +266,11 @@
 #[test]
 fn test_str_arrays() {
-    let values = GenericStringArray::<i32>::from(vec![
+    let values = GenericStringArray::<i64>::from(vec![
         "hi", "bye", "hi", "hi", "bye", "bye", "hi", "bye", "hi",
     ]);
     let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 6, 9]));
-    let field = Arc::new(Field::new("item", DataType::Utf8, true));
+    let field = Arc::new(Field::new("item", DataType::LargeUtf8, true));
     let arr_field = Arc::new(Field::new("item", DataType::List(field.clone()), true));
     let schema = Schema::new([arr_field.clone()]);
     let array = ListArray::new(field.clone(), offsets, Arc::new(values), None);
@@ -664,12 +664,12 @@ fn test_null_row() {
             Arc::new(StructArray::new_null(
                 [
                     Arc::new(Field::new("a", DataType::Int32, true)),
-                    Arc::new(Field::new("b", DataType::Utf8, false)),
+                    Arc::new(Field::new("b", DataType::LargeUtf8, false)),
                 ]
                 .into(),
                 1,
             )),
-            create_array!(Utf8, [None::<String>]),
+            create_array!(LargeUtf8, [None::<String>]),
         ],
     )
     .unwrap();
@@ -723,7 +723,7 @@ fn test_create_one() {
     let expected_schema = Arc::new(Schema::new(vec![
         Field::new("a", DataType::Int32, true),
-        Field::new("b", DataType::Utf8, true),
+        Field::new("b", DataType::LargeUtf8, true),
         Field::new("c", DataType::Int32, false),
         Field::new("d", DataType::Int32, true),
     ]));
@@ -731,7 +731,7 @@
         expected_schema,
         vec![
             create_array!(Int32, [1]),
-            create_array!(Utf8, ["B"]),
+            create_array!(LargeUtf8, ["B"]),
             create_array!(Int32, [3]),
             create_array!(Int32, [None]),
         ],
@@ -879,12 +879,12 @@ fn test_scalar_map() -> DeltaResult<()> {
     let arrow_array = scalar_map.to_array(2)?;
     let map_array = arrow_array.as_any().downcast_ref::<MapArray>().unwrap();
 
-    let key_builder = StringBuilder::new();
+    let key_builder = LargeStringBuilder::new();
     let val_builder = Int32Builder::new();
     let names = MapFieldNames {
-        entry: "key_values".to_string(),
-        key: "keys".to_string(),
-        value: "values".to_string(),
+        entry: "key_value".to_string(),
+        key: "key".to_string(),
+        value: "value".to_string(),
     };
     let mut builder = MapBuilder::new(Some(names), key_builder, val_builder);
     builder.keys().append_value("key1");
diff --git a/kernel/src/engine/arrow_get_data.rs b/kernel/src/engine/arrow_get_data.rs
index fbed64df10..06c195c6f1 100644
--- a/kernel/src/engine/arrow_get_data.rs
+++ b/kernel/src/engine/arrow_get_data.rs
@@ -1,7 +1,7 @@
 use crate::arrow::array::{
-    types::{GenericStringType, Int32Type, Int64Type},
-    Array, BooleanArray, GenericByteArray, GenericListArray, MapArray, OffsetSizeTrait,
-    PrimitiveArray,
+    types::{Int32Type, Int64Type},
+    Array, BooleanArray, GenericListArray, LargeStringArray, MapArray, OffsetSizeTrait,
+    PrimitiveArray, StringArray,
 };
 
 use crate::{
@@ -41,7 +41,17 @@ impl GetData<'_> for PrimitiveArray<Int64Type> {
     }
 }
 
-impl<'a> GetData<'a> for GenericByteArray<GenericStringType<i32>> {
+impl<'a> GetData<'a> for StringArray {
+    fn get_str(&'a self, row_index: usize, _field_name: &str) -> DeltaResult<Option<&'a str>> {
+        if self.is_valid(row_index) {
+            Ok(Some(self.value(row_index)))
+        } else {
+            Ok(None)
+        }
+    }
+}
+
+impl<'a> GetData<'a> for LargeStringArray {
     fn get_str(&'a self, row_index: usize, _field_name: &str) -> DeltaResult<Option<&'a str>> {
         if self.is_valid(row_index) {
             Ok(Some(self.value(row_index)))
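Both new `GetData` impls share the same null-aware accessor shape, since `value()` would happily return an empty string for a null slot. Sketched against the plain `arrow` crate:

```rust
use arrow::array::{Array, LargeStringArray};

fn get_str(arr: &LargeStringArray, row_index: usize) -> Option<&str> {
    // Check validity first: `value()` does not distinguish null from "".
    arr.is_valid(row_index).then(|| arr.value(row_index))
}

fn main() {
    let arr = LargeStringArray::from(vec![Some("a"), None]);
    assert_eq!(get_str(&arr, 0), Some("a"));
    assert_eq!(get_str(&arr, 1), None);
}
```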
diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs
index 21cb246f37..6648cee88f 100644
--- a/kernel/src/engine/arrow_utils.rs
+++ b/kernel/src/engine/arrow_utils.rs
@@ -18,7 +18,8 @@ use crate::{
 
 use crate::arrow::array::{
     cast::AsArray, make_array, new_null_array, Array as ArrowArray, BooleanArray, GenericListArray,
-    MapArray, OffsetSizeTrait, PrimitiveArray, RecordBatch, StringArray, StructArray,
+    GenericStringArray, LargeStringArray, MapArray, OffsetSizeTrait, PrimitiveArray, RecordBatch,
+    StringArray, StructArray,
 };
 use crate::arrow::buffer::NullBuffer;
 use crate::arrow::compute::concat_batches;
@@ -1012,16 +1013,24 @@ pub(crate) fn parse_json(
     schema: SchemaRef,
 ) -> DeltaResult<Box<dyn EngineData>> {
     let json_strings: RecordBatch = ArrowEngineData::try_from_engine_data(json_strings)?.into();
-    let json_strings = json_strings
-        .column(0)
-        .as_any()
-        .downcast_ref::<StringArray>()
-        .ok_or_else(|| {
-            Error::generic("Expected json_strings to be a StringArray, found something else")
-        })?;
+    let array_ref = json_strings.column(0);
     let schema = Arc::new(ArrowSchema::try_from_kernel(schema.as_ref())?);
-    let result = parse_json_impl(json_strings, schema)?;
-    Ok(Box::new(ArrowEngineData::new(result)))
+
+    // Try LargeStringArray first
+    if let Some(large_strings) = array_ref.as_any().downcast_ref::<LargeStringArray>() {
+        let result = parse_json_impl(large_strings, schema)?;
+        return Ok(Box::new(ArrowEngineData::new(result)));
+    }
+
+    // Fall back to StringArray
+    if let Some(strings) = array_ref.as_any().downcast_ref::<StringArray>() {
+        let result = parse_json_impl(strings, schema)?;
+        return Ok(Box::new(ArrowEngineData::new(result)));
+    }
+
+    Err(Error::generic(
+        "Expected json_strings to be a StringArray or LargeStringArray, found something else",
+    ))
 }
@@ -1029,7 +1038,10 @@ pub(crate) fn parse_json(
 
 // Raw arrow implementation of the json parsing. Separate from the public function for testing.
 // NOTE: This code is really inefficient because arrow lacks the native capability to perform robust
 // StringArray -> StructArray JSON parsing. See https://github.com/apache/arrow-rs/issues/6522. If
 // that shortcoming gets fixed upstream, this method can simplify or hopefully even disappear.
-fn parse_json_impl(json_strings: &StringArray, schema: ArrowSchemaRef) -> DeltaResult<RecordBatch> {
+fn parse_json_impl<O: OffsetSizeTrait>(
+    json_strings: &GenericStringArray<O>,
+    schema: ArrowSchemaRef,
+) -> DeltaResult<RecordBatch> {
     if json_strings.is_empty() {
         return Ok(RecordBatch::new_empty(schema));
     }
@@ -1231,45 +1243,45 @@ mod tests {
         ArrowField::new("c", ArrowDataType::Int32, true),
     ]));
     let input: Vec<&str> = vec![];
-    let result = parse_json_impl(&input.into(), requested_schema.clone()).unwrap();
+    let result = parse_json_impl::<i64>(&input.into(), requested_schema.clone()).unwrap();
     assert_eq!(result.num_rows(), 0);
 
     let input: Vec<Option<&str>> = vec![Some("")];
-    let result = parse_json_impl(&input.into(), requested_schema.clone());
+    let result = parse_json_impl::<i64>(&input.into(), requested_schema.clone());
     result.expect_err("empty string");
 
     let input: Vec<Option<&str>> = vec![Some(" \n\t")];
-    let result = parse_json_impl(&input.into(), requested_schema.clone());
+    let result = parse_json_impl::<i64>(&input.into(), requested_schema.clone());
     result.expect_err("empty string");
 
     let input: Vec<Option<&str>> = vec![Some(r#""a""#)];
-    let result = parse_json_impl(&input.into(), requested_schema.clone());
+    let result = parse_json_impl::<i64>(&input.into(), requested_schema.clone());
     result.expect_err("invalid string");
 
     let input: Vec<Option<&str>> = vec![Some(r#"{ "a": 1"#)];
-    let result = parse_json_impl(&input.into(), requested_schema.clone());
+    let result = parse_json_impl::<i64>(&input.into(), requested_schema.clone());
     result.expect_err("incomplete object");
 
     let input: Vec<Option<&str>> = vec![Some("{}{}")];
-    let result = parse_json_impl(&input.into(), requested_schema.clone());
+    let result = parse_json_impl::<i64>(&input.into(), requested_schema.clone());
     assert!(matches!(
         result.unwrap_err(),
         Error::Generic(s) if s == "Malformed JSON: Multiple JSON objects"
    ));
 
     let input: Vec<Option<&str>> = vec![Some(r#"{} { "a": 1"#)];
-    let result = parse_json_impl(&input.into(), requested_schema.clone());
+    let result = parse_json_impl::<i64>(&input.into(), requested_schema.clone());
     assert!(matches!(
         result.unwrap_err(),
         Error::Generic(s) if s == "Malformed JSON: Multiple JSON objects"
    ));
 
     let input: Vec<Option<&str>> = vec![Some(r#"{ "a": 1"#), Some(r#", "b"}"#)];
-    let result = parse_json_impl(&input.into(), requested_schema.clone());
+    let result = parse_json_impl::<i64>(&input.into(), requested_schema.clone());
     result.expect_err("split object");
 
     let input: Vec<Option<&str>> = vec![None, Some(r#"{"a": 1, "b": "2", "c": 3}"#), None];
-    let result = parse_json_impl(&input.into(), requested_schema.clone()).unwrap();
+    let result = parse_json_impl::<i64>(&input.into(), requested_schema.clone()).unwrap();
     assert_eq!(result.num_rows(), 3);
     assert_eq!(result.column(0).null_count(), 2);
     assert_eq!(result.column(1).null_count(), 2);
@@ -1288,7 +1300,7 @@ mod tests {
     let json_string = format!(r#"{{"long_val": "{long_string}"}}"#);
     let input: Vec<Option<&str>> = vec![Some(&json_string)];
-    let batch = parse_json_impl(&input.into(), schema.clone()).unwrap();
+    let batch = parse_json_impl::<i64>(&input.into(), schema.clone()).unwrap();
     assert_eq!(batch.num_rows(), 1);
     let long_col = batch.column(0).as_string::<i64>();
     assert_eq!(long_col.value(0), long_string);
@@ -1485,13 +1497,13 @@ mod tests {
         let parquet_schema = Arc::new(ArrowSchema::new(vec![
             ArrowField::new(parquet_name(0, mode), ArrowDataType::Int32, false)
                 .with_metadata(arrow_fid(0)),
-            ArrowField::new(parquet_name(1, mode), ArrowDataType::Utf8, true)
+            ArrowField::new(parquet_name(1, mode), ArrowDataType::LargeUtf8, true)
                 .with_metadata(arrow_fid(1)),
         ]));
         let res = get_requested_indices(&requested_schema, &parquet_schema);
         assert_result_error_with_message(
             res,
-            "Invalid argument error: Incorrect datatype. Expected integer, got Utf8",
+            "Invalid argument error: Incorrect datatype. Expected integer, got LargeUtf8",
         );
 
         let requested_schema = StructType::new_unchecked([
@@ -1509,7 +1521,7 @@ mod tests {
         let res = get_requested_indices(&requested_schema, &parquet_schema);
         assert_result_error_with_message(
             res,
-            "Invalid argument error: Incorrect datatype. Expected Utf8, got Int32",
+            "Invalid argument error: Incorrect datatype. Expected LargeUtf8, got Int32",
         );
     })
 }
@@ -1610,7 +1622,7 @@ mod tests {
             ReorderIndex::missing(
                 1,
                 Arc::new(
-                    ArrowField::new(parquet_name(1, mode), ArrowDataType::Utf8, true)
+                    ArrowField::new(parquet_name(1, mode), ArrowDataType::LargeUtf8, true)
                         .with_metadata(expected_arrow_metadata),
                 ),
             ),
@@ -1649,7 +1661,7 @@ mod tests {
             ReorderIndex::missing(
                 1,
                 Arc::new(
-                    ArrowField::new("s_physical", ArrowDataType::Utf8, true)
+                    ArrowField::new("s_physical", ArrowDataType::LargeUtf8, true)
                         .with_metadata(expected_arrow_metadata),
                 ),
             ),
@@ -1687,7 +1699,7 @@ mod tests {
             ReorderIndex::missing(
                 1,
                 Arc::new(
-                    ArrowField::new("s_physical", ArrowDataType::Utf8, true)
+                    ArrowField::new("s_physical", ArrowDataType::LargeUtf8, true)
                        .with_metadata(expected_arrow_metadata),
                 ),
             ),
@@ -2741,9 +2753,10 @@ mod tests {
         let mut fields = requested_schema.fields();
         let metadata1 = fields.next().unwrap().metadata_with_string_values();
         let metadata2 = fields.next().unwrap().metadata_with_string_values();
-        let expected_field1 = ArrowField::new(parquet_name(1, mode), ArrowDataType::Utf8, true)
-            .with_metadata(metadata1)
-            .into();
+        let expected_field1 =
+            ArrowField::new(parquet_name(1, mode), ArrowDataType::LargeUtf8, true)
+                .with_metadata(metadata1)
+                .into();
         let expected_field2 =
             ArrowField::new(parquet_name(2, mode), ArrowDataType::Int32, true)
                 .with_metadata(metadata2)
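The try-large-then-small downcast in `parse_json` reduces to the pattern below; `describe_json_column` is a made-up stand-in for the real dispatch, using the plain `arrow` crate:

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, LargeStringArray, StringArray};

fn describe_json_column(col: &ArrayRef) -> Result<&'static str, String> {
    // Prefer LargeStringArray, the kernel's default after this change ...
    if col.as_any().downcast_ref::<LargeStringArray>().is_some() {
        return Ok("LargeUtf8 JSON column");
    }
    // ... but keep accepting StringArray from engines that still produce Utf8.
    if col.as_any().downcast_ref::<StringArray>().is_some() {
        return Ok("Utf8 JSON column");
    }
    Err("expected a StringArray or LargeStringArray".to_string())
}

fn main() {
    let col: ArrayRef = Arc::new(LargeStringArray::from(vec![r#"{"a": 1}"#]));
    assert_eq!(describe_json_column(&col).unwrap(), "LargeUtf8 JSON column");
}
```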
diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs
index ac63877ec8..7ce536957b 100644
--- a/kernel/src/engine/default/parquet.rs
+++ b/kernel/src/engine/default/parquet.rs
@@ -4,8 +4,8 @@ use std::collections::HashMap;
 use std::ops::Range;
 use std::sync::Arc;
 
-use crate::arrow::array::builder::{MapBuilder, MapFieldNames, StringBuilder};
-use crate::arrow::array::{Int64Array, RecordBatch, StringArray, StructArray};
+use crate::arrow::array::builder::{LargeStringBuilder, MapBuilder, MapFieldNames};
+use crate::arrow::array::{Int64Array, LargeStringArray, RecordBatch, StructArray};
 use crate::arrow::datatypes::{DataType, Field};
 use crate::parquet::arrow::arrow_reader::{
     ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder,
@@ -77,9 +77,9 @@ impl DataFileMetadata {
             num_records,
         } = self;
         // create the record batch of the write metadata
-        let path = Arc::new(StringArray::from(vec![location.to_string()]));
-        let key_builder = StringBuilder::new();
-        let val_builder = StringBuilder::new();
+        let path = Arc::new(LargeStringArray::from(vec![location.to_string()]));
+        let key_builder = LargeStringBuilder::new();
+        let val_builder = LargeStringBuilder::new();
         let names = MapFieldNames {
             entry: "key_value".to_string(),
             key: "key".to_string(),
@@ -517,8 +517,8 @@ mod tests {
                 key: "key".to_string(),
                 value: "value".to_string(),
             }),
-            StringBuilder::new(),
-            StringBuilder::new(),
+            LargeStringBuilder::new(),
+            LargeStringBuilder::new(),
         );
         partition_values_builder.keys().append_value("partition1");
         partition_values_builder.values().append_value("a");
@@ -535,7 +535,7 @@ mod tests {
         let expected = RecordBatch::try_new(
             schema,
             vec![
-                Arc::new(StringArray::from(vec![location.to_string()])),
+                Arc::new(LargeStringArray::from(vec![location.to_string()])),
                 Arc::new(partition_values),
                 Arc::new(Int64Array::from(vec![size as i64])),
                 Arc::new(Int64Array::from(vec![last_modified])),
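For the write path, switching the metadata columns to `LargeStringArray` lifts the roughly 2 GiB total-bytes ceiling that i32 offsets impose on a Utf8 column. A small sketch with the plain `arrow` crate and an invented file location:

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, LargeStringArray};
use arrow::datatypes::DataType;

fn main() {
    let location = "s3://bucket/table/part-00000.parquet";
    let path: ArrayRef = Arc::new(LargeStringArray::from(vec![location.to_string()]));
    // i64 offsets: the column's total string bytes may exceed i32::MAX.
    assert_eq!(path.data_type(), &DataType::LargeUtf8);
}
```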
diff --git a/kernel/src/engine/ensure_data_types.rs b/kernel/src/engine/ensure_data_types.rs
index 2d5a660c36..8f9cd5be29 100644
--- a/kernel/src/engine/ensure_data_types.rs
+++ b/kernel/src/engine/ensure_data_types.rs
@@ -437,7 +437,7 @@ mod tests {
                 &ArrowDataType::new_list(ArrowDataType::Int64, true),
                 false,
             ),
-            "Invalid argument error: Incorrect datatype. Expected Utf8, got Int64",
+            "Invalid argument error: Incorrect datatype. Expected LargeUtf8, got Int64",
         );
         assert_result_error_with_message(
             ensure_data_types(