Skip to content

Commit 51da8c2

Browse files
committed
Add StringViewArray and BinaryViewArray (apache#4253)
1 parent 735f48d commit 51da8c2

File tree

19 files changed

+1014
-98
lines changed

19 files changed

+1014
-98
lines changed

arrow-array/src/array/byte_array.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ pub struct GenericByteArray<T: ByteArrayType> {
9494
impl<T: ByteArrayType> Clone for GenericByteArray<T> {
9595
fn clone(&self) -> Self {
9696
Self {
97-
data_type: self.data_type.clone(),
97+
data_type: T::DATA_TYPE,
9898
value_offsets: self.value_offsets.clone(),
9999
value_data: self.value_data.clone(),
100100
nulls: self.nulls.clone(),
+346
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,346 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::array::print_long_array;
19+
use crate::builder::GenericByteViewBuilder;
20+
use crate::iterator::ArrayIter;
21+
use crate::types::bytes::ByteArrayNativeType;
22+
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
23+
use crate::{Array, ArrayAccessor, ArrayRef};
24+
use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
25+
use arrow_data::view::View;
26+
use arrow_data::{ArrayData, ArrayDataBuilder};
27+
use arrow_schema::{ArrowError, DataType};
28+
use std::any::Any;
29+
use std::marker::PhantomData;
30+
use std::sync::Arc;
31+
32+
/// An array of variable length byte view arrays
33+
pub struct GenericByteViewArray<T: ByteViewType> {
34+
data_type: DataType,
35+
views: ScalarBuffer<u128>,
36+
buffers: Vec<Buffer>,
37+
nulls: Option<NullBuffer>,
38+
phantom: PhantomData<T>,
39+
}
40+
41+
impl<T: ByteViewType> Clone for GenericByteViewArray<T> {
42+
fn clone(&self) -> Self {
43+
Self {
44+
data_type: T::DATA_TYPE,
45+
views: self.views.clone(),
46+
buffers: self.buffers.clone(),
47+
nulls: self.nulls.clone(),
48+
phantom: Default::default(),
49+
}
50+
}
51+
}
52+
53+
impl<T: ByteViewType> GenericByteViewArray<T> {
54+
/// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
55+
///
56+
/// # Panics
57+
///
58+
/// Panics if [`GenericByteViewArray::try_new`] returns an error
59+
pub fn new(
60+
views: ScalarBuffer<u128>,
61+
buffers: Vec<Buffer>,
62+
nulls: Option<NullBuffer>,
63+
) -> Self {
64+
Self::try_new(views, buffers, nulls).unwrap()
65+
}
66+
67+
/// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
68+
///
69+
/// # Errors
70+
///
71+
/// * `views.len() != nulls.len()`
72+
/// * [ByteViewType::validate] fails
73+
pub fn try_new(
74+
views: ScalarBuffer<u128>,
75+
buffers: Vec<Buffer>,
76+
nulls: Option<NullBuffer>,
77+
) -> Result<Self, ArrowError> {
78+
// Verify data is valid
79+
T::validate(&views, &buffers)?;
80+
81+
if let Some(n) = nulls.as_ref() {
82+
if n.len() != views.len() {
83+
return Err(ArrowError::InvalidArgumentError(format!(
84+
"Incorrect length of null buffer for {}ViewArray, expected {} got {}",
85+
T::PREFIX,
86+
views.len(),
87+
n.len(),
88+
)));
89+
}
90+
}
91+
92+
Ok(Self {
93+
data_type: T::DATA_TYPE,
94+
phantom: Default::default(),
95+
views,
96+
buffers,
97+
nulls,
98+
})
99+
}
100+
101+
/// Create a new [`GenericByteViewArray`] from the provided parts, without validation
102+
///
103+
/// # Safety
104+
///
105+
/// Safe if [`Self::try_new`] would not error
106+
pub unsafe fn new_unchecked(
107+
views: ScalarBuffer<u128>,
108+
buffers: Vec<Buffer>,
109+
nulls: Option<NullBuffer>,
110+
) -> Self {
111+
Self {
112+
data_type: T::DATA_TYPE,
113+
phantom: Default::default(),
114+
views,
115+
buffers,
116+
nulls,
117+
}
118+
}
119+
120+
/// Create a new [`GenericByteViewArray`] of length `len` where all values are null
121+
pub fn new_null(len: usize) -> Self {
122+
Self {
123+
data_type: T::DATA_TYPE,
124+
views: vec![0; len].into(),
125+
buffers: vec![],
126+
nulls: Some(NullBuffer::new_null(len)),
127+
phantom: Default::default(),
128+
}
129+
}
130+
131+
/// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
132+
pub fn from_iter_values<Ptr, I>(iter: I) -> Self
133+
where
134+
Ptr: AsRef<T::Native>,
135+
I: IntoIterator<Item = Ptr>,
136+
{
137+
let iter = iter.into_iter();
138+
let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
139+
for v in iter {
140+
builder.append_value(v);
141+
}
142+
builder.finish()
143+
}
144+
145+
/// Deconstruct this array into its constituent parts
146+
pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
147+
(self.views, self.buffers, self.nulls)
148+
}
149+
150+
/// Returns the views buffer
151+
#[inline]
152+
pub fn views(&self) -> &ScalarBuffer<u128> {
153+
&self.views
154+
}
155+
156+
/// Returns the buffers storing string data
157+
#[inline]
158+
pub fn data_buffers(&self) -> &[Buffer] {
159+
&self.buffers
160+
}
161+
162+
/// Returns the element at index `i`
163+
/// # Panics
164+
/// Panics if index `i` is out of bounds.
165+
pub fn value(&self, i: usize) -> &T::Native {
166+
assert!(
167+
i < self.len(),
168+
"Trying to access an element at index {} from a {}ViewArray of length {}",
169+
i,
170+
T::PREFIX,
171+
self.len()
172+
);
173+
174+
assert!(i < self.views.len());
175+
unsafe { self.value_unchecked(i) }
176+
}
177+
178+
/// Returns the element at index `i`
179+
/// # Safety
180+
/// Caller is responsible for ensuring that the index is within the bounds of the array
181+
pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
182+
let v = self.views.get_unchecked(idx);
183+
let len = *v as u32;
184+
let b = if len <= 12 {
185+
let ptr = self.views.as_ptr() as *const u8;
186+
std::slice::from_raw_parts(ptr.add(idx * 16 + 4), len as usize)
187+
} else {
188+
let view = View::from(*v);
189+
let data = self.buffers.get_unchecked(view.buffer_index as usize);
190+
let offset = view.offset as usize;
191+
data.get_unchecked(offset..offset + len as usize)
192+
};
193+
T::Native::from_bytes_unchecked(b)
194+
}
195+
196+
/// constructs a new iterator
197+
pub fn iter(&self) -> ArrayIter<&Self> {
198+
ArrayIter::new(self)
199+
}
200+
201+
/// Returns a zero-copy slice of this array with the indicated offset and length.
202+
pub fn slice(&self, offset: usize, length: usize) -> Self {
203+
Self {
204+
data_type: T::DATA_TYPE,
205+
views: self.views.slice(offset, length),
206+
buffers: self.buffers.clone(),
207+
nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
208+
phantom: Default::default(),
209+
}
210+
}
211+
}
212+
213+
impl<T: ByteViewType> std::fmt::Debug for GenericByteViewArray<T> {
214+
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
215+
write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
216+
print_long_array(self, f, |array, index, f| {
217+
std::fmt::Debug::fmt(&array.value(index), f)
218+
})?;
219+
write!(f, "]")
220+
}
221+
}
222+
223+
impl<T: ByteViewType> Array for GenericByteViewArray<T> {
224+
fn as_any(&self) -> &dyn Any {
225+
self
226+
}
227+
228+
fn to_data(&self) -> ArrayData {
229+
self.clone().into()
230+
}
231+
232+
fn into_data(self) -> ArrayData {
233+
self.into()
234+
}
235+
236+
fn data_type(&self) -> &DataType {
237+
&self.data_type
238+
}
239+
240+
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
241+
Arc::new(self.slice(offset, length))
242+
}
243+
244+
fn len(&self) -> usize {
245+
self.views.len()
246+
}
247+
248+
fn is_empty(&self) -> bool {
249+
self.views.is_empty()
250+
}
251+
252+
fn offset(&self) -> usize {
253+
0
254+
}
255+
256+
fn nulls(&self) -> Option<&NullBuffer> {
257+
self.nulls.as_ref()
258+
}
259+
260+
fn get_buffer_memory_size(&self) -> usize {
261+
let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
262+
sum += self.views.inner().capacity();
263+
if let Some(x) = &self.nulls {
264+
sum += x.buffer().capacity()
265+
}
266+
sum
267+
}
268+
269+
fn get_array_memory_size(&self) -> usize {
270+
std::mem::size_of::<Self>() + self.get_buffer_memory_size()
271+
}
272+
}
273+
274+
impl<'a, T: ByteViewType> ArrayAccessor for &'a GenericByteViewArray<T> {
275+
type Item = &'a T::Native;
276+
277+
fn value(&self, index: usize) -> Self::Item {
278+
GenericByteViewArray::value(self, index)
279+
}
280+
281+
unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
282+
GenericByteViewArray::value_unchecked(self, index)
283+
}
284+
}
285+
286+
impl<'a, T: ByteViewType> IntoIterator for &'a GenericByteViewArray<T> {
287+
type Item = Option<&'a T::Native>;
288+
type IntoIter = ArrayIter<Self>;
289+
290+
fn into_iter(self) -> Self::IntoIter {
291+
ArrayIter::new(self)
292+
}
293+
}
294+
295+
impl<T: ByteViewType> From<ArrayData> for GenericByteViewArray<T> {
296+
fn from(value: ArrayData) -> Self {
297+
let views = value.buffers()[0].clone();
298+
let views = ScalarBuffer::new(views, value.offset(), value.len());
299+
let buffers = value.buffers()[1..].to_vec();
300+
Self {
301+
data_type: T::DATA_TYPE,
302+
views,
303+
buffers,
304+
nulls: value.nulls().cloned(),
305+
phantom: Default::default(),
306+
}
307+
}
308+
}
309+
310+
impl<T: ByteViewType> From<GenericByteViewArray<T>> for ArrayData {
311+
fn from(mut array: GenericByteViewArray<T>) -> Self {
312+
let len = array.len();
313+
array.buffers.insert(0, array.views.into_inner());
314+
let builder = ArrayDataBuilder::new(array.data_type)
315+
.len(len)
316+
.buffers(array.buffers)
317+
.nulls(array.nulls);
318+
319+
unsafe { builder.build_unchecked() }
320+
}
321+
}
322+
323+
impl<Ptr, T: ByteViewType> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
324+
where
325+
Ptr: AsRef<T::Native>,
326+
{
327+
fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
328+
let iter = iter.into_iter();
329+
let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
330+
builder.extend(iter);
331+
builder.finish()
332+
}
333+
}
334+
335+
/// A [`GenericByteViewArray`] of `str`
336+
///
337+
/// ```
338+
/// # use arrow_array::StringViewArray;
339+
/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "foo", "large payload over 12 bytes"]);
340+
/// assert_eq!(array.value(0), "hello");
341+
/// assert_eq!(array.value(3), "large payload over 12 bytes");
342+
/// ```
343+
pub type StringViewArray = GenericByteViewArray<StringViewType>;
344+
345+
/// A [`GenericByteViewArray`] of `[u8]`
346+
pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;

arrow-array/src/array/mod.rs

+3
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ pub use boolean_array::*;
3434
mod byte_array;
3535
pub use byte_array::*;
3636

37+
mod byte_view_array;
38+
pub use byte_view_array::*;
39+
3740
mod dictionary_array;
3841
pub use dictionary_array::*;
3942

0 commit comments

Comments
 (0)