From d70732243cf5c5957124c361fa7295562b0466b1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 28 Jan 2025 08:32:43 -0500 Subject: [PATCH 1/4] Add tests for `concat`ing sliced list arrays --- arrow-select/src/concat.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 4855e0087cc6..26bc258fc716 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -462,6 +462,34 @@ mod tests { assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); } + + #[test] + fn test_concat_primitive_list_arrays_slices() { + let list1 = vec![ + Some(vec![Some(-1), Some(-1), Some(2), None, None]), + Some(vec![]), // In slice + None, // In slice + Some(vec![Some(10)]), + ]; + let list1_array = ListArray::from_iter_primitive::(list1.clone()); + let list1_array = list1_array.slice(1, 2); + let list1_values = list1.into_iter().skip(1).take(2); + + let list2 = vec![ + None, + Some(vec![Some(100), None, Some(101)]), + Some(vec![Some(102)]), + ]; + let list2_array = ListArray::from_iter_primitive::(list2.clone()); + + let array_result = concat(&[&list1_array, &list2_array]).unwrap(); + + let expected = list1_values.chain(list2); + let array_expected = ListArray::from_iter_primitive::(expected); + + assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); + } + #[test] fn test_concat_primitive_fixed_size_list_arrays() { let list1 = vec![ From 6c6c9926a880011df174d94f6fe1d1cffaa445b8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 28 Jan 2025 09:02:52 -0500 Subject: [PATCH 2/4] fix sliced List arrays --- arrow-select/src/concat.rs | 56 +++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 26bc258fc716..1a4e81777b52 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -135,6 +135,7 @@ fn concat_lists( ) -> Result { let mut output_len = 0; let mut list_has_nulls = false; + let mut list_has_slices = false; let lists = arrays .iter() @@ -142,6 +143,8 @@ fn concat_lists( .inspect(|l| { output_len += l.len(); list_has_nulls |= l.null_count() != 0; + list_has_slices |= l.offsets()[0].as_usize() > 0 + || l.offsets().last().unwrap().as_usize() < l.values().len(); }) .collect::>(); @@ -156,10 +159,23 @@ fn concat_lists( NullBuffer::new(nulls.finish()) }); - let values: Vec<&dyn Array> = lists - .iter() - .map(|x| x.values().as_ref()) - .collect::>(); + // If any of the lists have slices, we need to slice the values + // to ensure that the offsets are correct + let mut sliced_values; + let values: Vec<&dyn Array> = if list_has_slices { + sliced_values = Vec::with_capacity(lists.len()); + for l in &lists { + // if the first offset is non-zero, we need to slice the values so when + // we concatenate them below only the relevant values are included + let offsets = l.offsets(); + let start_offset = offsets[0].as_usize(); + let end_offset = offsets.last().unwrap().as_usize(); + sliced_values.push(l.values().slice(start_offset, end_offset - start_offset)); + } + sliced_values.iter().map(|a| a.as_ref()).collect() + } else { + lists.iter().map(|x| x.values().as_ref()).collect() + }; let concatenated_values = concat(values.as_slice())?; @@ -462,13 +478,12 @@ mod tests { assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); } - #[test] fn test_concat_primitive_list_arrays_slices() { let list1 = vec![ Some(vec![Some(-1), Some(-1), Some(2), None, None]), - Some(vec![]), // In slice - None, // In slice + Some(vec![]), // In slice + None, // In slice Some(vec![Some(10)]), ]; let list1_array = ListArray::from_iter_primitive::(list1.clone()); @@ -490,6 +505,33 @@ mod tests { assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); } + #[test] + fn test_concat_primitive_list_arrays_sliced_lengths() { + let list1 = vec![ + Some(vec![Some(-1), Some(-1), Some(2), None, None]), // In slice + Some(vec![]), // In slice + None, // In slice + Some(vec![Some(10)]), + ]; + let list1_array = ListArray::from_iter_primitive::(list1.clone()); + let list1_array = list1_array.slice(0, 3); // no offset, but not all values + let list1_values = list1.into_iter().take(3); + + let list2 = vec![ + None, + Some(vec![Some(100), None, Some(101)]), + Some(vec![Some(102)]), + ]; + let list2_array = ListArray::from_iter_primitive::(list2.clone()); + + let array_result = concat(&[&list1_array, &list2_array]).unwrap(); + + let expected = list1_values.chain(list2); + let array_expected = ListArray::from_iter_primitive::(expected); + + assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); + } + #[test] fn test_concat_primitive_fixed_size_list_arrays() { let list1 = vec![ From e22760ea4356c0e978152173a008b25e1aede2da Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 28 Jan 2025 10:18:19 -0500 Subject: [PATCH 3/4] Additional asserts to ensure test covers the issue --- arrow-select/src/concat.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 1a4e81777b52..8592510cf08b 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -497,6 +497,8 @@ mod tests { ]; let list2_array = ListArray::from_iter_primitive::(list2.clone()); + // verify that this test covers the case when the first offset is non zero + assert!(list1_array.offsets()[0].as_usize() > 0); let array_result = concat(&[&list1_array, &list2_array]).unwrap(); let expected = list1_values.chain(list2); @@ -524,6 +526,10 @@ mod tests { ]; let list2_array = ListArray::from_iter_primitive::(list2.clone()); + // verify that this test covers the case when the first offset is zero, but the + // last offset doesn't cover the entire array + assert_eq!(list1_array.offsets()[0].as_usize(), 0); + assert!(list1_array.offsets().last().unwrap().as_usize() < list1_array.values().len()); let array_result = concat(&[&list1_array, &list2_array]).unwrap(); let expected = list1_values.chain(list2); From 7b0c82813945bb02ce562269daa8f71577ccbac0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 28 Jan 2025 15:24:08 -0500 Subject: [PATCH 4/4] Update arrow-select/src/concat.rs Co-authored-by: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> --- arrow-select/src/concat.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 8592510cf08b..1f453466dc9b 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -143,7 +143,7 @@ fn concat_lists( .inspect(|l| { output_len += l.len(); list_has_nulls |= l.null_count() != 0; - list_has_slices |= l.offsets()[0].as_usize() > 0 + list_has_slices |= l.offsets()[0] > OffsetSize::zero() || l.offsets().last().unwrap().as_usize() < l.values().len(); }) .collect::>();