Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions acceptance/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,22 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {

// some things are equivalent, but don't show up as equivalent for `==`, so we normalize here
fn normalize_col(col: Arc<dyn Array>) -> Arc<dyn Array> {
if let DataType::Timestamp(unit, Some(zone)) = col.data_type() {
if **zone == *"+00:00" {
let data_type = DataType::Timestamp(*unit, Some("UTC".into()));
delta_kernel::arrow::compute::cast(&col, &data_type).expect("Could not cast to UTC")
} else {
col
match col.data_type() {
DataType::Timestamp(unit, Some(zone)) => {
if **zone == *"+00:00" {
let data_type = DataType::Timestamp(*unit, Some("UTC".into()));
delta_kernel::arrow::compute::cast(&col, &data_type).expect("Could not cast to UTC")
} else {
col
}
}
DataType::Utf8 => {
// just make everything LargeUtf8
let data_type = DataType::LargeUtf8;
delta_kernel::arrow::compute::cast(&col, &data_type)
.expect("Could not cast to large utf8")
}
} else {
col
_ => col,
}
}

Expand Down
36 changes: 19 additions & 17 deletions kernel/src/actions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1003,8 +1003,8 @@ mod tests {
use super::*;
use crate::{
arrow::array::{
Array, BooleanArray, Int32Array, Int64Array, ListArray, ListBuilder, MapBuilder,
MapFieldNames, RecordBatch, StringArray, StringBuilder, StructArray,
Array, BooleanArray, Int32Array, Int64Array, LargeStringArray, LargeStringBuilder,
ListArray, ListBuilder, MapBuilder, MapFieldNames, RecordBatch, StructArray,
},
arrow::datatypes::{DataType as ArrowDataType, Field, Schema},
arrow::json::ReaderBuilder,
Expand Down Expand Up @@ -1044,19 +1044,19 @@ mod tests {

fn create_string_map_builder(
nullable_values: bool,
) -> MapBuilder<StringBuilder, StringBuilder> {
) -> MapBuilder<LargeStringBuilder, LargeStringBuilder> {
MapBuilder::new(
Some(MapFieldNames {
entry: "key_value".to_string(),
key: "key".to_string(),
value: "value".to_string(),
}),
StringBuilder::new(),
StringBuilder::new(),
LargeStringBuilder::new(),
LargeStringBuilder::new(),
)
.with_values_field(Field::new(
"value".to_string(),
ArrowDataType::Utf8,
ArrowDataType::LargeUtf8,
nullable_values,
))
}
Expand Down Expand Up @@ -1527,15 +1527,15 @@ mod tests {
.into();

let schema = Arc::new(Schema::new(vec![
Field::new("appId", ArrowDataType::Utf8, false),
Field::new("appId", ArrowDataType::LargeUtf8, false),
Field::new("version", ArrowDataType::Int64, false),
Field::new("lastUpdated", ArrowDataType::Int64, true),
]));

let expected = RecordBatch::try_new(
schema,
vec![
Arc::new(StringArray::from(vec!["app_id"])),
Arc::new(LargeStringArray::from(vec!["app_id"])),
Arc::new(Int64Array::from(vec![0_i64])),
Arc::new(Int64Array::from(vec![None::<i64>])),
],
Expand Down Expand Up @@ -1570,11 +1570,13 @@ mod tests {
vec![
Arc::new(Int64Array::from(vec![Some(0)])),
Arc::new(Int64Array::from(vec![None::<i64>])),
Arc::new(StringArray::from(vec![Some("UNKNOWN")])),
Arc::new(LargeStringArray::from(vec![Some("UNKNOWN")])),
operation_parameters,
Arc::new(StringArray::from(vec![Some(format!("v{KERNEL_VERSION}"))])),
Arc::new(StringArray::from(vec![None::<String>])),
Arc::new(StringArray::from(vec![commit_info_txn_id])),
Arc::new(LargeStringArray::from(vec![Some(format!(
"v{KERNEL_VERSION}"
))])),
Arc::new(LargeStringArray::from(vec![None::<String>])),
Arc::new(LargeStringArray::from(vec![commit_info_txn_id])),
],
)
.unwrap();
Expand Down Expand Up @@ -1605,8 +1607,8 @@ mod tests {
let expected = RecordBatch::try_new(
record_batch.schema(),
vec![
Arc::new(StringArray::from(vec!["my.domain"])),
Arc::new(StringArray::from(vec!["config_value"])),
Arc::new(LargeStringArray::from(vec!["my.domain"])),
Arc::new(LargeStringArray::from(vec!["config_value"])),
Arc::new(BooleanArray::from(vec![false])),
],
)
Expand Down Expand Up @@ -1859,7 +1861,7 @@ mod tests {
.unwrap()
.into();

let list_field = Arc::new(Field::new("element", ArrowDataType::Utf8, false));
let list_field = Arc::new(Field::new("element", ArrowDataType::LargeUtf8, false));
let protocol_fields = vec![
Field::new("minReaderVersion", ArrowDataType::Int32, false),
Field::new("minWriterVersion", ArrowDataType::Int32, false),
Expand All @@ -1876,13 +1878,13 @@ mod tests {
];
let schema = Arc::new(Schema::new(protocol_fields.clone()));

let string_builder = StringBuilder::new();
let string_builder = LargeStringBuilder::new();
let mut list_builder = ListBuilder::new(string_builder).with_field(list_field.clone());
list_builder.values().append_value("columnMapping");
list_builder.append(true);
let reader_features_array = list_builder.finish();

let string_builder = StringBuilder::new();
let string_builder = LargeStringBuilder::new();
let mut list_builder = ListBuilder::new(string_builder).with_field(list_field.clone());
list_builder.values().append_value("deletionVectors");
list_builder.append(true);
Expand Down
2 changes: 1 addition & 1 deletion kernel/src/engine/arrow_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ impl TryFromKernel<&DataType> for ArrowDataType {
match t {
DataType::Primitive(p) => {
match p {
PrimitiveType::String => Ok(ArrowDataType::Utf8),
PrimitiveType::String => Ok(ArrowDataType::LargeUtf8),
PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type
PrimitiveType::Integer => Ok(ArrowDataType::Int32),
PrimitiveType::Short => Ok(ArrowDataType::Int16),
Expand Down
99 changes: 76 additions & 23 deletions kernel/src/engine/arrow_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ impl From<Box<ArrowEngineData>> for RecordBatch {
}
}

/// Reads the string at `index` from `arry` (viewed as a string array with
/// `OffsetSize` offsets, i.e. Utf8 for i32 / LargeUtf8 for i64) and returns
/// an owned copy of it.
fn get_string_value<OffsetSize: OffsetSizeTrait>(arry: &dyn Array, index: usize) -> String {
    let strings = arry.as_string::<OffsetSize>();
    String::from(strings.value(index))
}

impl<OffsetSize> EngineList for GenericListArray<OffsetSize>
where
OffsetSize: OffsetSizeTrait,
Expand All @@ -100,8 +105,11 @@ where

fn get(&self, row_index: usize, index: usize) -> String {
let arry = self.value(row_index);
let sarry = arry.as_string::<i32>();
sarry.value(index).to_string()
match arry.data_type() {
ArrowDataType::LargeUtf8 => get_string_value::<i64>(arry.as_ref(), index),
ArrowDataType::Utf8 => get_string_value::<i32>(arry.as_ref(), index),
_ => String::new(),
}
}

fn materialize(&self, row_index: usize) -> Vec<String> {
Expand All @@ -113,35 +121,70 @@ where
}
}

/// Looks up `key` among the map entries in `[start_offset, start_offset + count)`
/// and returns a borrow of the matching value, or `None` when the key is absent.
///
/// Both `keys` and `vals` are interpreted as string arrays with `OffsetSize`
/// offsets (Utf8 for i32, LargeUtf8 for i64). Null key slots never match.
fn get_map_value<'a, OffsetSize: OffsetSizeTrait>(
    keys: &'a dyn Array,
    vals: &'a dyn Array,
    start_offset: usize,
    count: usize,
    key: &str,
) -> Option<&'a str> {
    let key_arr = keys.as_string::<OffsetSize>();
    let val_arr = vals.as_string::<OffsetSize>();
    // Scan only this row's slice of the flattened entries; the first matching
    // (non-null) key wins.
    let hit = key_arr
        .iter()
        .enumerate()
        .skip(start_offset)
        .take(count)
        .find(|(_, k)| k.map_or(false, |k| k == key));
    hit.map(|(idx, _)| val_arr.value(idx))
}

/// Collects the paired string entries of `keys`/`values` into an owned
/// `HashMap`, interpreting both arrays with `OffsetSize` offsets (Utf8 for
/// i32, LargeUtf8 for i64). Entries where either the key or the value is
/// null are skipped.
fn materialize_map<OffsetSize: OffsetSizeTrait>(
    keys: &dyn Array,
    values: &dyn Array,
) -> HashMap<String, String> {
    keys.as_string::<OffsetSize>()
        .iter()
        .zip(values.as_string::<OffsetSize>().iter())
        .filter_map(|(k, v)| Some((k?.to_string(), v?.to_string())))
        .collect()
}

impl EngineMap for MapArray {
fn get<'a>(&'a self, row_index: usize, key: &str) -> Option<&'a str> {
let offsets = self.offsets();
let start_offset = offsets[row_index] as usize;
let count = offsets[row_index + 1] as usize - start_offset;
let keys = self.keys().as_string::<i32>();
for (idx, map_key) in keys.iter().enumerate().skip(start_offset).take(count) {
if let Some(map_key) = map_key {
if key == map_key {
// found the item
let vals = self.values().as_string::<i32>();
return Some(vals.value(idx));
}

match (self.keys().data_type(), self.values().data_type()) {
(ArrowDataType::LargeUtf8, ArrowDataType::LargeUtf8) => {
get_map_value::<i64>(self.keys(), self.values(), start_offset, count, key)
}
(ArrowDataType::Utf8, ArrowDataType::Utf8) => {
get_map_value::<i32>(self.keys(), self.values(), start_offset, count, key)
}
_ => None,
}
None
}

fn materialize(&self, row_index: usize) -> HashMap<String, String> {
let mut ret = HashMap::new();
let map_val = self.value(row_index);
let keys = map_val.column(0).as_string::<i32>();
let values = map_val.column(1).as_string::<i32>();
for (key, value) in keys.iter().zip(values.iter()) {
if let (Some(key), Some(value)) = (key, value) {
ret.insert(key.into(), value.into());
match (map_val.column(0).data_type(), map_val.column(1).data_type()) {
(ArrowDataType::LargeUtf8, ArrowDataType::LargeUtf8) => {
materialize_map::<i64>(map_val.column(0), map_val.column(1))
}
(ArrowDataType::Utf8, ArrowDataType::Utf8) => {
materialize_map::<i32>(map_val.column(0), map_val.column(1))
}
_ => HashMap::new(),
}
ret
}
}

Expand Down Expand Up @@ -277,19 +320,24 @@ impl ArrowEngineData {
data_type: &DataType,
col: &'a dyn Array,
) -> DeltaResult<&'a dyn GetData<'a>> {
use ArrowDataType::Utf8;
use ArrowDataType::{LargeUtf8, Utf8, Utf8View};

// Helper to check if a type is a string type (Utf8, LargeUtf8, or Utf8View)
let is_string_type = |dt: &ArrowDataType| matches!(dt, Utf8 | LargeUtf8 | Utf8View);

let col_as_list = || {
if let Some(array) = col.as_list_opt::<i32>() {
(array.value_type() == Utf8).then_some(array as _)
is_string_type(&array.value_type()).then_some(array as _)
} else if let Some(array) = col.as_list_opt::<i64>() {
(array.value_type() == Utf8).then_some(array as _)
is_string_type(&array.value_type()).then_some(array as _)
} else {
None
}
};
let col_as_map = || {
col.as_map_opt().and_then(|array| {
(array.key_type() == &Utf8 && array.value_type() == &Utf8).then_some(array as _)
(is_string_type(array.key_type()) && is_string_type(array.value_type()))
.then_some(array as _)
})
};
let result: Result<&'a dyn GetData<'a>, _> = match data_type {
Expand All @@ -299,7 +347,12 @@ impl ArrowEngineData {
}
&DataType::STRING => {
debug!("Pushing string array for {}", ColumnName::new(path));
col.as_string_opt().map(|a| a as _).ok_or("string")
match col.data_type() {
ArrowDataType::LargeUtf8 => col.as_string_opt::<i64>().map(|a| a as _),
ArrowDataType::Utf8 => col.as_string_opt::<i32>().map(|a| a as _),
_ => None,
}
.ok_or("string")
}
&DataType::INTEGER => {
debug!("Pushing int32 array for {}", ColumnName::new(path));
Expand Down
2 changes: 1 addition & 1 deletion kernel/src/engine/arrow_expression/evaluate_expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1041,7 +1041,7 @@ mod tests {
let result2 = coalesce_arrays(&[arr1, arr2], Some(&DataType::STRING));
assert_result_error_with_message(
result2,
"Requested result type Utf8 does not match arrays' data type Int32",
"Requested result type LargeUtf8 does not match arrays' data type Int32",
);
}

Expand Down
46 changes: 42 additions & 4 deletions kernel/src/engine/arrow_expression/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,41 @@ mod tests;

// TODO leverage scalars / Datum

/// Appends `value` (or a null when `value` is `None`) to `builder` exactly
/// `num_rows` times.
///
/// Works with both `StringBuilder` (Utf8, i32 offsets) and
/// `LargeStringBuilder` (LargeUtf8, i64 offsets) by downcasting the erased
/// `ArrayBuilder`; any other builder type yields an error.
fn append_string_to_builder(
    builder: &mut dyn ArrayBuilder,
    value: Option<&str>,
    num_rows: usize,
) -> DeltaResult<()> {
    // Dispatch on the concrete builder behind the trait object, then fill it.
    if let Some(b) = builder.as_any_mut().downcast_mut::<array::StringBuilder>() {
        match value {
            Some(v) => (0..num_rows).for_each(|_| b.append_value(v)),
            None => (0..num_rows).for_each(|_| b.append_null()),
        }
        Ok(())
    } else if let Some(b) = builder
        .as_any_mut()
        .downcast_mut::<array::LargeStringBuilder>()
    {
        match value {
            Some(v) => (0..num_rows).for_each(|_| b.append_value(v)),
            None => (0..num_rows).for_each(|_| b.append_null()),
        }
        Ok(())
    } else {
        Err(Error::invalid_expression("Invalid builder type for string"))
    }
}

impl Scalar {
/// Convert scalar to arrow array.
pub fn to_array(&self, num_rows: usize) -> DeltaResult<ArrayRef> {
Expand Down Expand Up @@ -86,7 +121,7 @@ impl Scalar {
Byte(val) => append_val_as!(array::Int8Builder, *val),
Float(val) => append_val_as!(array::Float32Builder, *val),
Double(val) => append_val_as!(array::Float64Builder, *val),
String(val) => append_val_as!(array::StringBuilder, val),
String(val) => append_string_to_builder(builder, Some(val), num_rows)?,
Boolean(val) => append_val_as!(array::BooleanBuilder, *val),
Timestamp(val) | TimestampNtz(val) => {
// timezone was already set at builder construction time
Expand Down Expand Up @@ -167,7 +202,7 @@ impl Scalar {
DataType::BYTE => append_null_as!(array::Int8Builder),
DataType::FLOAT => append_null_as!(array::Float32Builder),
DataType::DOUBLE => append_null_as!(array::Float64Builder),
DataType::STRING => append_null_as!(array::StringBuilder),
DataType::STRING => append_string_to_builder(builder, None, num_rows)?,
DataType::BOOLEAN => append_null_as!(array::BooleanBuilder),
DataType::TIMESTAMP | DataType::TIMESTAMP_NTZ => {
append_null_as!(array::TimestampMicrosecondBuilder)
Expand Down Expand Up @@ -310,8 +345,11 @@ impl ExpressionEvaluator for DefaultExpressionEvaluator {
(expr, output_type) => {
let array_ref = evaluate_expression(expr, batch, Some(output_type))?;
let array_ref = apply_schema_to(&array_ref, output_type)?;
let arrow_type = ArrowDataType::try_from_kernel(output_type)?;
let schema = ArrowSchema::new(vec![ArrowField::new("output", arrow_type, true)]);
// Use the actual data type of the array, not the converted kernel type
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

when does using output_type cause an issue? I feel like we should not do this, because we'll get unexpected behavior where the output doesn't actually match what we asked for

// This allows both Utf8 and LargeUtf8 to work correctly
let actual_arrow_type = array_ref.data_type().clone();
let schema =
ArrowSchema::new(vec![ArrowField::new("output", actual_arrow_type, true)]);
RecordBatch::try_new(Arc::new(schema), vec![array_ref])?
}
};
Expand Down
Loading
Loading