Skip to content

Commit 07ae1dd

Browse files
authored
[Variant] Introduce new BorrowedShreddingState concept (#8499)
# Which issue does this PR close? - Relates to #8336 # Rationale for this change While pathfinding support for unshredding variant objects and arrays, I ran into the same issue that `variant_get` already suffers -- `ShreddingState` is inconvenient because it owns the `value` and `typed_value` columns it refers to, even tho borrowing them usually suffices. # What changes are included in this PR? Define a new `BorrowedShreddingState` which does what it says, and update `ShreddedPathStep` (in variant_get.rs) to use it. Also, make the constructor fallible in order to correctly report casting failures if the `value` column is not a binary view. # Are these changes tested? Yes, existing tests cover this code. # Are there any user-facing changes? New `TryFrom` and `From` implementations. `impl From<&StructArray> for ShreddingState` was replaced by a suitable `TryFrom`.
1 parent 686d085 commit 07ae1dd

File tree

3 files changed

+97
-40
lines changed

3 files changed

+97
-40
lines changed

parquet-variant-compute/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ mod variant_array_builder;
4646
pub mod variant_get;
4747
mod variant_to_arrow;
4848

49-
pub use variant_array::{ShreddingState, VariantArray, VariantType};
49+
pub use variant_array::{BorrowedShreddingState, ShreddingState, VariantArray, VariantType};
5050
pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder};
5151

5252
pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options};

parquet-variant-compute/src/variant_array.rs

Lines changed: 84 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -272,26 +272,11 @@ impl VariantArray {
272272
)));
273273
};
274274

275-
// Extract value and typed_value fields
276-
let value = if let Some(value_col) = inner.column_by_name("value") {
277-
if let Some(binary_view) = value_col.as_binary_view_opt() {
278-
Some(binary_view.clone())
279-
} else {
280-
return Err(ArrowError::NotYetImplemented(format!(
281-
"VariantArray 'value' field must be BinaryView, got {}",
282-
value_col.data_type()
283-
)));
284-
}
285-
} else {
286-
None
287-
};
288-
let typed_value = inner.column_by_name("typed_value").cloned();
289-
290275
// Note these clones are cheap, they just bump the ref count
291276
Ok(Self {
292277
inner: inner.clone(),
293278
metadata: metadata.clone(),
294-
shredding_state: ShreddingState::new(value, typed_value),
279+
shredding_state: ShreddingState::try_from(inner)?,
295280
})
296281
}
297282

@@ -521,7 +506,7 @@ impl ShreddedVariantFieldArray {
521506
// Note this clone is cheap, it just bumps the ref count
522507
Ok(Self {
523508
inner: inner_struct.clone(),
524-
shredding_state: ShreddingState::from(inner_struct),
509+
shredding_state: ShreddingState::try_from(inner_struct)?,
525510
})
526511
}
527512

@@ -660,7 +645,7 @@ impl ShreddingState {
660645
/// Create a new `ShreddingState` from the given `value` and `typed_value` fields
661646
///
662647
/// Note you can create a `ShreddingState` from a &[`StructArray`] using
663-
/// `ShreddingState::from(&struct_array)`, for example:
648+
/// `ShreddingState::try_from(&struct_array)`, for example:
664649
///
665650
/// ```no_run
666651
/// # use arrow::array::StructArray;
@@ -669,7 +654,7 @@ impl ShreddingState {
669654
/// # unimplemented!()
670655
/// # }
671656
/// let struct_array: StructArray = get_struct_array();
672-
/// let shredding_state = ShreddingState::from(&struct_array);
657+
/// let shredding_state = ShreddingState::try_from(&struct_array).unwrap();
673658
/// ```
674659
pub fn new(value: Option<BinaryViewArray>, typed_value: Option<ArrayRef>) -> Self {
675660
Self { value, typed_value }
@@ -685,6 +670,14 @@ impl ShreddingState {
685670
self.typed_value.as_ref()
686671
}
687672

673+
/// Returns a borrowed version of this shredding state
674+
pub fn borrow(&self) -> BorrowedShreddingState<'_> {
675+
BorrowedShreddingState {
676+
value: self.value_field(),
677+
typed_value: self.typed_value_field(),
678+
}
679+
}
680+
688681
/// Slice all the underlying arrays
689682
pub fn slice(&self, offset: usize, length: usize) -> Self {
690683
Self {
@@ -694,14 +687,79 @@ impl ShreddingState {
694687
}
695688
}
696689

697-
impl From<&StructArray> for ShreddingState {
698-
fn from(inner_struct: &StructArray) -> Self {
699-
let value = inner_struct
700-
.column_by_name("value")
701-
.and_then(|col| col.as_binary_view_opt().cloned());
702-
let typed_value = inner_struct.column_by_name("typed_value").cloned();
690+
/// Similar to [`ShreddingState`] except it holds borrowed references of the target arrays. Useful
691+
/// for avoiding clone operations when the caller does not need a self-standing shredding state.
692+
#[derive(Clone, Debug)]
693+
pub struct BorrowedShreddingState<'a> {
694+
value: Option<&'a BinaryViewArray>,
695+
typed_value: Option<&'a ArrayRef>,
696+
}
703697

704-
ShreddingState::new(value, typed_value)
698+
impl<'a> BorrowedShreddingState<'a> {
699+
/// Create a new `BorrowedShreddingState` from the given `value` and `typed_value` fields
700+
///
701+
/// Note you can create a `BorrowedShreddingState` from a &[`StructArray`] using
702+
/// `BorrowedShreddingState::try_from(&struct_array)`, for example:
703+
///
704+
/// ```no_run
705+
/// # use arrow::array::StructArray;
706+
/// # use parquet_variant_compute::BorrowedShreddingState;
707+
/// # fn get_struct_array() -> StructArray {
708+
/// # unimplemented!()
709+
/// # }
710+
/// let struct_array: StructArray = get_struct_array();
711+
/// let shredding_state = BorrowedShreddingState::try_from(&struct_array).unwrap();
712+
/// ```
713+
pub fn new(value: Option<&'a BinaryViewArray>, typed_value: Option<&'a ArrayRef>) -> Self {
714+
Self { value, typed_value }
715+
}
716+
717+
/// Return a reference to the value field, if present
718+
pub fn value_field(&self) -> Option<&'a BinaryViewArray> {
719+
self.value
720+
}
721+
722+
/// Return a reference to the typed_value field, if present
723+
pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
724+
self.typed_value
725+
}
726+
}
727+
728+
impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
729+
type Error = ArrowError;
730+
731+
fn try_from(inner_struct: &'a StructArray) -> Result<Self, ArrowError> {
732+
// The `value` column need not exist, but if it does it must be a binary view.
733+
let value = if let Some(value_col) = inner_struct.column_by_name("value") {
734+
let Some(binary_view) = value_col.as_binary_view_opt() else {
735+
return Err(ArrowError::NotYetImplemented(format!(
736+
"VariantArray 'value' field must be BinaryView, got {}",
737+
value_col.data_type()
738+
)));
739+
};
740+
Some(binary_view)
741+
} else {
742+
None
743+
};
744+
let typed_value = inner_struct.column_by_name("typed_value");
745+
Ok(BorrowedShreddingState::new(value, typed_value))
746+
}
747+
}
748+
749+
impl TryFrom<&StructArray> for ShreddingState {
750+
type Error = ArrowError;
751+
752+
fn try_from(inner_struct: &StructArray) -> Result<Self, ArrowError> {
753+
Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
754+
}
755+
}
756+
757+
impl From<BorrowedShreddingState<'_>> for ShreddingState {
758+
fn from(state: BorrowedShreddingState<'_>) -> Self {
759+
ShreddingState {
760+
value: state.value_field().cloned(),
761+
typed_value: state.typed_value_field().cloned(),
762+
}
705763
}
706764
}
707765

parquet-variant-compute/src/variant_get.rs

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,16 @@ use arrow::{
2323
use arrow_schema::{ArrowError, DataType, FieldRef};
2424
use parquet_variant::{VariantPath, VariantPathElement};
2525

26-
use crate::variant_array::ShreddingState;
26+
use crate::variant_array::BorrowedShreddingState;
2727
use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
2828
use crate::VariantArray;
2929

3030
use arrow::array::AsArray;
3131
use std::sync::Arc;
3232

33-
pub(crate) enum ShreddedPathStep {
33+
pub(crate) enum ShreddedPathStep<'a> {
3434
/// Path step succeeded, return the new shredding state
35-
Success(ShreddingState),
35+
Success(BorrowedShreddingState<'a>),
3636
/// The path element is not present in the `typed_value` column and there is no `value` column,
3737
/// so we know it does not exist. It, and all paths under it, are all-NULL.
3838
Missing,
@@ -47,18 +47,16 @@ pub(crate) enum ShreddedPathStep {
4747
/// level, or if `typed_value` is not a struct, or if the requested field name does not exist.
4848
///
4949
/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe not even possible.
50-
pub(crate) fn follow_shredded_path_element(
51-
shredding_state: &ShreddingState,
50+
pub(crate) fn follow_shredded_path_element<'a>(
51+
shredding_state: &BorrowedShreddingState<'a>,
5252
path_element: &VariantPathElement<'_>,
5353
cast_options: &CastOptions,
54-
) -> Result<ShreddedPathStep> {
54+
) -> Result<ShreddedPathStep<'a>> {
5555
// If the requested path element is not present in `typed_value`, and `value` is missing, then
5656
// we know it does not exist; it, and all paths under it, are all-NULL.
57-
let missing_path_step = || {
58-
let Some(_value_field) = shredding_state.value_field() else {
59-
return ShreddedPathStep::Missing;
60-
};
61-
ShreddedPathStep::NotShredded
57+
let missing_path_step = || match shredding_state.value_field() {
58+
Some(_) => ShreddedPathStep::NotShredded,
59+
None => ShreddedPathStep::Missing,
6260
};
6361

6462
let Some(typed_value) = shredding_state.typed_value_field() else {
@@ -98,7 +96,8 @@ pub(crate) fn follow_shredded_path_element(
9896
))
9997
})?;
10098

101-
Ok(ShreddedPathStep::Success(struct_array.into()))
99+
let state = BorrowedShreddingState::try_from(struct_array)?;
100+
Ok(ShreddedPathStep::Success(state))
102101
}
103102
VariantPathElement::Index { .. } => {
104103
// TODO: Support array indexing. Among other things, it will require slicing not
@@ -152,7 +151,7 @@ fn shredded_get_path(
152151

153152
// Peel away the prefix of path elements that traverses the shredded parts of this variant
154153
// column. Shredding will traverse the rest of the path on a per-row basis.
155-
let mut shredding_state = input.shredding_state().clone();
154+
let mut shredding_state = input.shredding_state().borrow();
156155
let mut accumulated_nulls = input.inner().nulls().cloned();
157156
let mut path_index = 0;
158157
for path_element in path {

0 commit comments

Comments
 (0)