diff --git a/encodings/alp/src/alp/compute/between.rs b/encodings/alp/src/alp/compute/between.rs index 577d3d4243..c959076f34 100644 --- a/encodings/alp/src/alp/compute/between.rs +++ b/encodings/alp/src/alp/compute/between.rs @@ -96,7 +96,7 @@ mod tests { .unwrap() .to_bool() .unwrap() - .boolean_buffer() + .bit_buffer() .iter() .collect_vec(); assert_eq!(res.len(), 1); diff --git a/encodings/alp/src/alp/compute/compare.rs b/encodings/alp/src/alp/compute/compare.rs index e991102a53..86e3772d69 100644 --- a/encodings/alp/src/alp/compute/compare.rs +++ b/encodings/alp/src/alp/compute/compare.rs @@ -141,7 +141,7 @@ mod tests { { alp_scalar_compare(alp, value, operator) .unwrap() - .map(|a| a.to_bool().unwrap().boolean_buffer().iter().collect()) + .map(|a| a.to_bool().unwrap().bit_buffer().iter().collect()) } #[test] @@ -160,7 +160,7 @@ mod tests { .to_bool() .unwrap(); - for v in r.boolean_buffer().iter() { + for v in r.bit_buffer().iter() { assert!(!v); } @@ -170,7 +170,7 @@ mod tests { .to_bool() .unwrap(); - for v in r.boolean_buffer().iter() { + for v in r.bit_buffer().iter() { assert!(v); } } @@ -192,7 +192,7 @@ mod tests { .to_bool() .unwrap(); - assert!(r_eq.boolean_buffer().iter().all(|v| !v)); + assert!(r_eq.bit_buffer().iter().all(|v| !v)); #[allow(clippy::excessive_precision)] let r_neq = alp_scalar_compare(&encoded, 1.234444f32, Operator::NotEq) @@ -201,7 +201,7 @@ mod tests { .to_bool() .unwrap(); - assert!(r_neq.boolean_buffer().iter().all(|v| v)); + assert!(r_neq.bit_buffer().iter().all(|v| v)); } #[test] @@ -221,7 +221,7 @@ mod tests { .unwrap(); // !(0.0605_f32 >= 0.06051_f32); - assert!(r_gte.boolean_buffer().iter().all(|v| !v)); + assert!(r_gte.bit_buffer().iter().all(|v| !v)); let r_gt = alp_scalar_compare(&encoded, 0.06051_f32, Operator::Gt) .unwrap() @@ -230,7 +230,7 @@ mod tests { .unwrap(); // (0.0605_f32 > 0.06051_f32); - assert!(r_gt.boolean_buffer().iter().all(|v| !v)); + assert!(r_gt.bit_buffer().iter().all(|v| !v)); let r_lte = 
alp_scalar_compare(&encoded, 0.06051_f32, Operator::Lte) .unwrap() @@ -239,7 +239,7 @@ mod tests { .unwrap(); // 0.0605_f32 <= 0.06051_f32; - assert!(r_lte.boolean_buffer().iter().all(|v| v)); + assert!(r_lte.bit_buffer().iter().all(|v| v)); let r_lt = alp_scalar_compare(&encoded, 0.06051_f32, Operator::Lt) .unwrap() @@ -248,7 +248,7 @@ mod tests { .unwrap(); //0.0605_f32 < 0.06051_f32; - assert!(r_lt.boolean_buffer().iter().all(|v| v)); + assert!(r_lt.bit_buffer().iter().all(|v| v)); } #[test] @@ -313,7 +313,7 @@ mod tests { .to_bool() .unwrap(); - for v in r.boolean_buffer().iter() { + for v in r.bit_buffer().iter() { assert!(!v); } } diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs index 824af7936f..6bae881b78 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs @@ -93,7 +93,7 @@ mod tests { assert_eq!( res.to_bool() .unwrap() - .boolean_buffer() + .bit_buffer() .iter() .collect::>(), vec![false, false, true] diff --git a/encodings/dict/src/compute/compare.rs b/encodings/dict/src/compute/compare.rs index 261ecca37c..7b342fb770 100644 --- a/encodings/dict/src/compute/compare.rs +++ b/encodings/dict/src/compute/compare.rs @@ -54,7 +54,7 @@ fn dict_equal_to( ) -> VortexResult { let bool_result = values_compare.to_bool()?; let result_validity = bool_result.validity_mask()?; - let bool_buffer = bool_result.boolean_buffer(); + let bool_buffer = bool_result.bit_buffer(); let (first_match, second_match) = match result_validity.boolean_buffer() { AllOr::All => { let mut indices_iter = bool_buffer.set_indices(); @@ -147,7 +147,7 @@ mod tests { .unwrap(); let res = res.to_bool().unwrap(); assert_eq!( - res.boolean_buffer().iter().collect::>(), + res.bit_buffer().iter().collect::>(), vec![true, false, false] ); } @@ -168,7 +168,7 @@ mod tests { .unwrap(); let 
res = res.to_bool().unwrap(); assert_eq!( - res.boolean_buffer().iter().collect::>(), + res.bit_buffer().iter().collect::>(), vec![false, true, true] ); } @@ -193,7 +193,7 @@ mod tests { .unwrap(); let res = res.to_bool().unwrap(); assert_eq!( - res.boolean_buffer().iter().collect::>(), + res.bit_buffer().iter().collect::>(), vec![false, false, false] ); assert_eq!(res.dtype().nullability(), Nullability::Nullable); @@ -223,7 +223,7 @@ mod tests { .unwrap(); let res = res.to_bool().unwrap(); assert_eq!( - res.boolean_buffer().iter().collect::>(), + res.bit_buffer().iter().collect::>(), vec![false, false, false] ); assert_eq!(res.dtype().nullability(), Nullability::Nullable); diff --git a/encodings/dict/src/compute/fill_null.rs b/encodings/dict/src/compute/fill_null.rs index 98cabe22c4..00bc535786 100644 --- a/encodings/dict/src/compute/fill_null.rs +++ b/encodings/dict/src/compute/fill_null.rs @@ -17,7 +17,7 @@ impl FillNullKernel for DictVTable { )? .to_bool()?; - let Some(first_fill_value) = found_fill_values.boolean_buffer().set_indices().next() else { + let Some(first_fill_value) = found_fill_values.bit_buffer().set_indices().next() else { // No fill values found, so we must canonicalize and fill_null. // TODO(ngates): compute kernels should all return Option to support this // fall back. 
diff --git a/encodings/fastlanes/src/for/compute/compare.rs b/encodings/fastlanes/src/for/compute/compare.rs index e145bc3177..0e93598eb4 100644 --- a/encodings/fastlanes/src/for/compute/compare.rs +++ b/encodings/fastlanes/src/for/compute/compare.rs @@ -187,6 +187,6 @@ mod tests { expected: T, ) { let result = result.unwrap().unwrap().to_bool().unwrap(); - assert_eq!(result.boolean_buffer(), &BooleanBuffer::from_iter(expected)); + assert_eq!(result.bit_buffer(), &BooleanBuffer::from_iter(expected)); } } diff --git a/encodings/fsst/src/compute/compare.rs b/encodings/fsst/src/compute/compare.rs index 4da0f72f91..b6e63711ce 100644 --- a/encodings/fsst/src/compute/compare.rs +++ b/encodings/fsst/src/compute/compare.rs @@ -139,7 +139,7 @@ mod tests { assert_eq!(equals.dtype(), &DType::Bool(Nullability::Nullable)); assert_eq!( - equals.boolean_buffer().into_iter().collect::>(), + equals.bit_buffer().into_iter().collect::>(), vec![false, false, true, false, false] ); @@ -151,7 +151,7 @@ mod tests { assert_eq!(not_equals.dtype(), &DType::Bool(Nullability::Nullable)); assert_eq!( - not_equals.boolean_buffer().into_iter().collect::>(), + not_equals.bit_buffer().into_iter().collect::>(), vec![true, true, false, true, true] ); diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 82660451b7..d36ca12f52 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -188,7 +188,7 @@ impl ValidityVTable for RunEndVTable { ) .vortex_expect("invalid array") .into_array(); - Mask::from_buffer(ree_validity.to_bool()?.boolean_buffer().clone()) + Mask::from_buffer(ree_validity.to_bool()?.bit_buffer().clone()) } }) } diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index 6ba386e1f3..b5643bd71a 100644 --- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -25,7 +25,7 @@ pub fn runend_encode(array: &PrimitiveArray) -> VortexResult<(PrimitiveArray, Ar 
ConstantArray::new(Scalar::null(array.dtype().clone()), 1).into_array(), )); } - Validity::Array(a) => Some(a.to_bool()?.boolean_buffer().clone()), + Validity::Array(a) => Some(a.to_bool()?.bit_buffer().clone()), }; let (ends, values) = match validity { diff --git a/encodings/runend/src/compute/compare.rs b/encodings/runend/src/compute/compare.rs index d113d6cd68..979f60a2ab 100644 --- a/encodings/runend/src/compute/compare.rs +++ b/encodings/runend/src/compute/compare.rs @@ -65,7 +65,7 @@ mod test { .unwrap(); let res_canon = res.to_bool().unwrap(); assert_eq!( - res_canon.boolean_buffer(), + res_canon.bit_buffer(), &BooleanBuffer::from(vec![ false, false, false, false, false, false, false, false, true, true, true, true ]) diff --git a/encodings/sparse/src/canonical.rs b/encodings/sparse/src/canonical.rs index 537961478a..110a57174d 100644 --- a/encodings/sparse/src/canonical.rs +++ b/encodings/sparse/src/canonical.rs @@ -123,13 +123,13 @@ mod test { fill_value, ); - assert_eq!(flat_bools.boolean_buffer(), expected.boolean_buffer()); + assert_eq!(flat_bools.bit_buffer(), expected.bit_buffer()); assert_eq!(flat_bools.validity(), expected.validity()); - assert!(flat_bools.boolean_buffer().value(0)); + assert!(flat_bools.bit_buffer().value(0)); assert!(flat_bools.validity().is_valid(0).unwrap()); assert_eq!( - flat_bools.boolean_buffer().value(1), + flat_bools.bit_buffer().value(1), fill_value.unwrap_or_default() ); assert!(!flat_bools.validity().is_valid(1).unwrap()); @@ -137,7 +137,7 @@ mod test { flat_bools.validity().is_valid(2).unwrap(), fill_value.is_some() ); - assert!(!flat_bools.boolean_buffer().value(7)); + assert!(!flat_bools.bit_buffer().value(7)); assert!(flat_bools.validity().is_valid(7).unwrap()); } diff --git a/encodings/sparse/src/lib.rs b/encodings/sparse/src/lib.rs index b725597635..99ce29ae1d 100644 --- a/encodings/sparse/src/lib.rs +++ b/encodings/sparse/src/lib.rs @@ -192,7 +192,7 @@ impl SparseArray { &Scalar::bool(true, 
Nullability::NonNullable), )? .to_bool()? - .boolean_buffer() + .bit_buffer() .clone(), ); diff --git a/fuzz/fuzz_targets/file_io.rs b/fuzz/fuzz_targets/file_io.rs index 096d1cb182..66fb7156f5 100644 --- a/fuzz/fuzz_targets/file_io.rs +++ b/fuzz/fuzz_targets/file_io.rs @@ -72,7 +72,7 @@ fuzz_target!(|array_data: ArbitraryArray| -> Corpus { .vortex_unwrap() .to_bool() .vortex_unwrap(); - let true_count = bool_result.boolean_buffer().count_set_bits(); + let true_count = bool_result.bit_buffer().count_set_bits(); if true_count != array_data.len() && (bool_result.all_valid().vortex_unwrap() || array_data.all_valid().vortex_unwrap()) diff --git a/fuzz/src/compare.rs b/fuzz/src/compare.rs index 7ad5c9eca4..caa06f913e 100644 --- a/fuzz/src/compare.rs +++ b/fuzz/src/compare.rs @@ -32,7 +32,7 @@ pub fn compare_canonical_array( Ok(compare_to( array .to_bool()? - .boolean_buffer() + .bit_buffer() .iter() .zip(array.validity_mask()?.to_boolean_buffer().iter()) .map(|(b, v)| v.then_some(b)), diff --git a/fuzz/src/filter.rs b/fuzz/src/filter.rs index 773b6d68d9..9e7501fa96 100644 --- a/fuzz/src/filter.rs +++ b/fuzz/src/filter.rs @@ -31,7 +31,7 @@ pub fn filter_canonical_array(array: &dyn Array, filter: &[bool]) -> VortexResul BooleanBuffer::from_iter( filter .iter() - .zip(bool_array.boolean_buffer().iter()) + .zip(bool_array.bit_buffer().iter()) .filter(|(f, _)| **f) .map(|(_, v)| v), ), diff --git a/fuzz/src/search_sorted.rs b/fuzz/src/search_sorted.rs index 7a657033fb..a0882c5a8a 100644 --- a/fuzz/src/search_sorted.rs +++ b/fuzz/src/search_sorted.rs @@ -53,7 +53,7 @@ pub fn search_sorted_canonical_array( let bool_array = array.to_bool()?; let validity = bool_array.validity_mask()?.to_boolean_buffer(); let opt_values = bool_array - .boolean_buffer() + .bit_buffer() .iter() .zip(validity.iter()) .map(|(b, v)| v.then_some(b)) diff --git a/fuzz/src/slice.rs b/fuzz/src/slice.rs index 9c9babc0f9..533606aa9e 100644 --- a/fuzz/src/slice.rs +++ b/fuzz/src/slice.rs @@ -22,7 +22,7 @@ 
pub fn slice_canonical_array( match array.dtype() { DType::Bool(_) => { let bool_array = array.to_bool()?; - let sliced_bools = bool_array.boolean_buffer().slice(start, stop - start); + let sliced_bools = bool_array.bit_buffer().slice(start, stop - start); Ok(BoolArray::new(sliced_bools, validity).into_array()) } DType::Primitive(p, _) => { diff --git a/fuzz/src/sort.rs b/fuzz/src/sort.rs index 101203431d..499be5ae31 100644 --- a/fuzz/src/sort.rs +++ b/fuzz/src/sort.rs @@ -13,7 +13,7 @@ pub fn sort_canonical_array(array: &dyn Array) -> VortexResult { DType::Bool(_) => { let bool_array = array.to_bool()?; let mut opt_values = bool_array - .boolean_buffer() + .bit_buffer() .iter() .zip(bool_array.validity_mask()?.to_boolean_buffer().iter()) .map(|(b, v)| v.then_some(b)) diff --git a/fuzz/src/take.rs b/fuzz/src/take.rs index c75d88562e..d7b85a9407 100644 --- a/fuzz/src/take.rs +++ b/fuzz/src/take.rs @@ -20,7 +20,7 @@ pub fn take_canonical_array(array: &dyn Array, indices: &[usize]) -> VortexResul match array.dtype() { DType::Bool(_) => { let bool_array = array.to_bool()?; - let vec_values = bool_array.boolean_buffer().iter().collect::>(); + let vec_values = bool_array.bit_buffer().iter().collect::>(); Ok( BoolArray::new(indices.iter().map(|i| vec_values[*i]).collect(), validity) .into_array(), diff --git a/vortex-array/src/arrays/bool/array.rs b/vortex-array/src/arrays/bool/array.rs index a34721bca5..98e595da54 100644 --- a/vortex-array/src/arrays/bool/array.rs +++ b/vortex-array/src/arrays/bool/array.rs @@ -1,5 +1,5 @@ use arrow_array::BooleanArray; -use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder, MutableBuffer}; +use vortex_buffer::{BitBuffer, BitBufferMut}; use vortex_dtype::DType; use vortex_error::{VortexResult, vortex_panic}; @@ -13,7 +13,7 @@ use crate::vtable::{ArrayVTable, CanonicalVTable, ValidityHelper}; #[derive(Clone, Debug)] pub struct BoolArray { dtype: DType, - buffer: BooleanBuffer, + buffer: BitBuffer, pub(crate) validity: Validity, 
pub(crate) stats_set: ArrayStats, } @@ -22,19 +22,15 @@ impl BoolArray { /// Create a new BoolArray from a set of indices and a length. /// All indices must be less than the length. pub fn from_indices<I: IntoIterator<Item = usize>>(length: usize, indices: I) -> Self { - let mut buffer = MutableBuffer::new_null(length); - indices - .into_iter() - .for_each(|idx| arrow_buffer::bit_util::set_bit(&mut buffer, idx)); - Self::new( - BooleanBufferBuilder::new_from_buffer(buffer, length).finish(), - Validity::NonNullable, - ) + let mut buffer = BitBufferMut::new(length); + + indices.into_iter().for_each(|idx| buffer.set(idx)); + Self::new(buffer.freeze(), Validity::NonNullable) } - /// Creates a new [`BoolArray`] from a [`BooleanBuffer`] and [`Validity`], without checking + /// Creates a new [`BoolArray`] from a [`BitBuffer`] and [`Validity`], without checking /// any invariants. - pub fn new(buffer: BooleanBuffer, validity: Validity) -> Self { + pub fn new(buffer: BitBuffer, validity: Validity) -> Self { if let Some(len) = validity.maybe_len() { if buffer.len() != len { vortex_panic!( @@ -56,7 +52,7 @@ impl BoolArray { } - /// Returns the underlying [`BooleanBuffer`] of the array. - pub fn boolean_buffer(&self) -> &BooleanBuffer { + /// Returns the underlying [`BitBuffer`] of the array. + pub fn bit_buffer(&self) -> &BitBuffer { assert!( self.buffer.offset() < 8, "Offset must be <8, did we forget to call shrink_offset? Found {}", @@ -71,7 +67,7 @@ impl BoolArray { /// otherwise a copy is created. 
/// - /// The second value of the tuple is a bit_offset of first value in first byte of the returned builder - pub fn into_boolean_builder(self) -> (BooleanBufferBuilder, usize) { + /// The second value of the tuple is a bit_offset of first value in first byte of the returned buffer + pub fn into_bit_mut(self) -> (BitBufferMut, usize) { let offset = self.buffer.offset(); let len = self.buffer.len(); let arrow_buffer = self.buffer.into_inner(); @@ -94,15 +90,15 @@ impl BoolArray { } } -impl From<BooleanBuffer> for BoolArray { - fn from(value: BooleanBuffer) -> Self { +impl From<BitBuffer> for BoolArray { + fn from(value: BitBuffer) -> Self { Self::new(value, Validity::NonNullable) } } impl FromIterator<bool> for BoolArray { fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self { - Self::new(BooleanBuffer::from_iter(iter), Validity::NonNullable) + Self::new(BitBuffer::from_iter(iter), Validity::NonNullable) } } @@ -147,27 +143,10 @@ impl CanonicalVTable for BoolVTable { } } -pub trait BooleanBufferExt { - /// Slice any full bytes from the buffer, leaving the offset < 8. - fn shrink_offset(self) -> Self; -} - -impl BooleanBufferExt for BooleanBuffer { - fn shrink_offset(self) -> Self { - let byte_offset = self.offset() / 8; - let bit_offset = self.offset() % 8; - let len = self.len(); - let buffer = self - .into_inner() - .slice_with_length(byte_offset, (len + bit_offset).div_ceil(8)); - BooleanBuffer::new(buffer, bit_offset, len) - } -} - #[cfg(test)] mod tests { use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder}; - use vortex_buffer::buffer; + use vortex_buffer::{BitBuffer, buffer}; use crate::arrays::{BoolArray, PrimitiveArray}; use crate::compute::conformance::mask::test_mask; @@ -225,7 +204,7 @@ mod tests { }; let sliced = arr.slice(4, 12).unwrap(); let sliced_len = sliced.len(); - let (values, offset) = sliced.to_bool().unwrap().into_boolean_builder(); + let (values, offset) = sliced.to_bool().unwrap().into_bit_mut(); assert_eq!(offset, 4); assert_eq!(values.as_slice(), &[254, 15]); @@ -238,13 +217,13 @@ mod tests { ); let arr = arr.patch(&patches).unwrap(); let arr_len = arr.len(); - let (values, offset) = 
arr.to_bool().unwrap().into_boolean_builder(); + let (values, offset) = arr.to_bool().unwrap().into_bit_mut(); assert_eq!(offset, 0); assert_eq!(values.len(), arr_len + offset); assert_eq!(values.as_slice(), &[238, 15]); // the slice should be unchanged - let (values, offset) = sliced.to_bool().unwrap().into_boolean_builder(); + let (values, offset) = sliced.to_bool().unwrap().into_bit_mut(); assert_eq!(offset, 4); assert_eq!(values.len(), sliced_len + offset); assert_eq!(values.as_slice(), &[254, 15]); // unchanged @@ -255,7 +234,7 @@ mod tests { let arr = BoolArray::from(BooleanBuffer::new_set(16)); let sliced = arr.slice(4, 12).unwrap(); let sliced_len = sliced.len(); - let (values, offset) = sliced.to_bool().unwrap().into_boolean_builder(); + let (values, offset) = sliced.to_bool().unwrap().into_bit_mut(); assert_eq!(offset, 4); assert_eq!(values.len(), sliced_len + offset); assert_eq!(values.as_slice(), &[255, 15]); @@ -264,10 +243,9 @@ mod tests { #[test] #[should_panic] fn patch_bools_owned() { - let buffer = buffer![255u8; 2]; - let buf = BooleanBuffer::new(buffer.into_arrow_buffer(), 0, 15); + let buf = BitBuffer::new_set(16); let arr = BoolArray::new(buf, Validity::NonNullable); - let buf_ptr = arr.boolean_buffer().sliced().as_ptr(); + let buf_ptr = arr.bit_buffer().sliced().as_ptr(); let patches = Patches::new( arr.len(), @@ -276,9 +254,9 @@ mod tests { BoolArray::from(BooleanBuffer::new_unset(1)).into_array(), ); let arr = arr.patch(&patches).unwrap(); - assert_eq!(arr.boolean_buffer().sliced().as_ptr(), buf_ptr); + assert_eq!(arr.bit_buffer().sliced().as_ptr(), buf_ptr); - let (values, _byte_bit_offset) = arr.to_bool().unwrap().into_boolean_builder(); + let (values, _byte_bit_offset) = arr.to_bool().unwrap().into_bit_mut(); assert_eq!(values.as_slice(), &[254, 127]); } diff --git a/vortex-array/src/arrays/bool/compute/cast.rs b/vortex-array/src/arrays/bool/compute/cast.rs index 9408fc78ea..d2e503cf1b 100644 --- 
a/vortex-array/src/arrays/bool/compute/cast.rs +++ b/vortex-array/src/arrays/bool/compute/cast.rs @@ -15,7 +15,7 @@ impl CastKernel for BoolVTable { let new_nullability = dtype.nullability(); let new_validity = array.validity().clone().cast_nullability(new_nullability)?; - Ok(BoolArray::new(array.boolean_buffer().clone(), new_validity).to_array()) + Ok(BoolArray::new(array.bit_buffer().clone(), new_validity).to_array()) } } diff --git a/vortex-array/src/arrays/bool/compute/fill_null.rs b/vortex-array/src/arrays/bool/compute/fill_null.rs index 48b3039b16..680fa020e0 100644 --- a/vortex-array/src/arrays/bool/compute/fill_null.rs +++ b/vortex-array/src/arrays/bool/compute/fill_null.rs @@ -16,7 +16,7 @@ impl FillNullKernel for BoolVTable { Ok(match array.validity() { Validity::NonNullable | Validity::AllValid => BoolArray::new( - array.boolean_buffer().clone(), + array.bit_buffer().clone(), fill_value.dtype().nullability().into(), ) .into_array(), @@ -25,9 +25,9 @@ impl FillNullKernel for BoolVTable { } Validity::Array(v) => { let bool_buffer = if fill { - array.boolean_buffer() | &!v.to_bool()?.boolean_buffer() + array.bit_buffer() | &!v.to_bool()?.bit_buffer() } else { - array.boolean_buffer() & v.to_bool()?.boolean_buffer() + array.bit_buffer() & v.to_bool()?.bit_buffer() }; BoolArray::new(bool_buffer, fill_value.dtype().nullability().into()).into_array() } @@ -61,7 +61,7 @@ mod tests { .to_bool() .unwrap(); assert_eq!( - non_null_array.boolean_buffer().iter().collect::>(), + non_null_array.bit_buffer().iter().collect::>(), expected ); assert_eq!( diff --git a/vortex-array/src/arrays/bool/compute/filter.rs b/vortex-array/src/arrays/bool/compute/filter.rs index 3b35615559..0c634019df 100644 --- a/vortex-array/src/arrays/bool/compute/filter.rs +++ b/vortex-array/src/arrays/bool/compute/filter.rs @@ -20,12 +20,12 @@ impl FilterKernel for BoolVTable { let buffer = match mask_values.threshold_iter(FILTER_SLICES_DENSITY_THRESHOLD) { MaskIter::Indices(indices) => 
filter_indices( - array.boolean_buffer(), + array.bit_buffer(), mask.true_count(), indices.iter().copied(), ), MaskIter::Slices(slices) => filter_slices( - array.boolean_buffer(), + array.bit_buffer(), mask.true_count(), slices.iter().copied(), ), @@ -91,7 +91,7 @@ mod test { assert_eq!( vec![true, false], - filtered.boolean_buffer().iter().collect_vec() + filtered.bit_buffer().iter().collect_vec() ) } @@ -99,7 +99,7 @@ mod test { fn filter_bool_by_slice_test() { let arr = BoolArray::from_iter([true, true, false]); - let filtered = filter_slices(arr.boolean_buffer(), 2, [(0, 1), (2, 3)].into_iter()); + let filtered = filter_slices(arr.bit_buffer(), 2, [(0, 1), (2, 3)].into_iter()); assert_eq!(2, filtered.len()); assert_eq!(vec![true, false], filtered.iter().collect_vec()) @@ -109,7 +109,7 @@ mod test { fn filter_bool_by_index_test() { let arr = BoolArray::from_iter([true, true, false]); - let filtered = filter_indices(arr.boolean_buffer(), 2, [0, 2].into_iter()); + let filtered = filter_indices(arr.bit_buffer(), 2, [0, 2].into_iter()); assert_eq!(2, filtered.len()); assert_eq!(vec![true, false], filtered.iter().collect_vec()) diff --git a/vortex-array/src/arrays/bool/compute/invert.rs b/vortex-array/src/arrays/bool/compute/invert.rs index 18bec2c9ed..f9dbceffe9 100644 --- a/vortex-array/src/arrays/bool/compute/invert.rs +++ b/vortex-array/src/arrays/bool/compute/invert.rs @@ -9,7 +9,7 @@ use crate::{ArrayRef, IntoArray, register_kernel}; impl InvertKernel for BoolVTable { fn invert(&self, array: &BoolArray) -> VortexResult { - Ok(BoolArray::new(array.boolean_buffer().not(), array.validity().clone()).into_array()) + Ok(BoolArray::new(array.bit_buffer().not(), array.validity().clone()).into_array()) } } diff --git a/vortex-array/src/arrays/bool/compute/is_constant.rs b/vortex-array/src/arrays/bool/compute/is_constant.rs index 4907f15d62..3989c1261e 100644 --- a/vortex-array/src/arrays/bool/compute/is_constant.rs +++ 
b/vortex-array/src/arrays/bool/compute/is_constant.rs @@ -11,7 +11,7 @@ impl IsConstantKernel for BoolVTable { return Ok(None); } - let buffer = array.boolean_buffer(); + let buffer = array.bit_buffer(); // Safety: // We must have at least one value at this point diff --git a/vortex-array/src/arrays/bool/compute/is_sorted.rs b/vortex-array/src/arrays/bool/compute/is_sorted.rs index a2b55e1980..efe714a3e3 100644 --- a/vortex-array/src/arrays/bool/compute/is_sorted.rs +++ b/vortex-array/src/arrays/bool/compute/is_sorted.rs @@ -9,10 +9,10 @@ impl IsSortedKernel for BoolVTable { fn is_sorted(&self, array: &BoolArray) -> VortexResult { match array.validity_mask()? { Mask::AllFalse(_) => Ok(true), - Mask::AllTrue(_) => Ok(array.boolean_buffer().iter().is_sorted()), + Mask::AllTrue(_) => Ok(array.bit_buffer().iter().is_sorted()), Mask::Values(mask_values) => { let set_indices = mask_values.boolean_buffer().set_indices(); - let values = array.boolean_buffer(); + let values = array.bit_buffer(); let values_iter = set_indices.map(|idx| // Safety: // All idxs are in-bounds for the array. @@ -28,10 +28,10 @@ impl IsSortedKernel for BoolVTable { fn is_strict_sorted(&self, array: &BoolArray) -> VortexResult { match array.validity_mask()? 
{ Mask::AllFalse(_) => Ok(false), - Mask::AllTrue(_) => Ok(array.boolean_buffer().iter().is_strict_sorted()), + Mask::AllTrue(_) => Ok(array.bit_buffer().iter().is_strict_sorted()), Mask::Values(mask_values) => { let validity_buffer = mask_values.boolean_buffer(); - let values = array.boolean_buffer(); + let values = array.bit_buffer(); Ok(validity_buffer .iter() diff --git a/vortex-array/src/arrays/bool/compute/mask.rs b/vortex-array/src/arrays/bool/compute/mask.rs index d1a4cdabb7..08a6b2227e 100644 --- a/vortex-array/src/arrays/bool/compute/mask.rs +++ b/vortex-array/src/arrays/bool/compute/mask.rs @@ -8,10 +8,7 @@ use crate::{ArrayRef, IntoArray, register_kernel}; impl MaskKernel for BoolVTable { fn mask(&self, array: &BoolArray, mask: &Mask) -> VortexResult { - Ok( - BoolArray::new(array.boolean_buffer().clone(), array.validity().mask(mask)?) - .into_array(), - ) + Ok(BoolArray::new(array.bit_buffer().clone(), array.validity().mask(mask)?).into_array()) } } diff --git a/vortex-array/src/arrays/bool/compute/min_max.rs b/vortex-array/src/arrays/bool/compute/min_max.rs index a200241673..bda61412bc 100644 --- a/vortex-array/src/arrays/bool/compute/min_max.rs +++ b/vortex-array/src/arrays/bool/compute/min_max.rs @@ -11,9 +11,9 @@ use crate::register_kernel; impl MinMaxKernel for BoolVTable { fn min_max(&self, array: &BoolArray) -> VortexResult> { let x = match array.validity_mask()? 
{ - Mask::AllTrue(_) => array.boolean_buffer().clone(), + Mask::AllTrue(_) => array.bit_buffer().clone(), Mask::AllFalse(_) => return Ok(None), - Mask::Values(v) => array.boolean_buffer().bitand(v.boolean_buffer()), + Mask::Values(v) => array.bit_buffer().bitand(v.boolean_buffer()), }; // TODO(ngates): we should be able to bail out earlier as soon as we have one true and diff --git a/vortex-array/src/arrays/bool/compute/sum.rs b/vortex-array/src/arrays/bool/compute/sum.rs index a0f53743d6..6d4e3bde8c 100644 --- a/vortex-array/src/arrays/bool/compute/sum.rs +++ b/vortex-array/src/arrays/bool/compute/sum.rs @@ -13,18 +13,15 @@ impl SumKernel for BoolVTable { let true_count: Option = match array.validity_mask()?.boolean_buffer() { AllOr::All => { // All-valid - Some(array.boolean_buffer().count_set_bits() as u64) + Some(array.bit_buffer().count_set_bits() as u64) } AllOr::None => { // All-invalid unreachable!("All-invalid boolean array should have been handled by entry-point") } - AllOr::Some(validity_mask) => Some( - array - .boolean_buffer() - .bitand(validity_mask) - .count_set_bits() as u64, - ), + AllOr::Some(validity_mask) => { + Some(array.bit_buffer().bitand(validity_mask).count_set_bits() as u64) + } }; Ok(Scalar::from(true_count)) } diff --git a/vortex-array/src/arrays/bool/compute/take.rs b/vortex-array/src/arrays/bool/compute/take.rs index 06c92a6400..850c3129d7 100644 --- a/vortex-array/src/arrays/bool/compute/take.rs +++ b/vortex-array/src/arrays/bool/compute/take.rs @@ -92,8 +92,8 @@ mod test { .to_bool() .unwrap(); assert_eq!( - b.boolean_buffer(), - BoolArray::from_iter([Some(false), None, Some(false)]).boolean_buffer() + b.bit_buffer(), + BoolArray::from_iter([Some(false), None, Some(false)]).bit_buffer() ); let nullable_bool_dtype = DType::Bool(Nullability::Nullable); diff --git a/vortex-array/src/arrays/bool/ops.rs b/vortex-array/src/arrays/bool/ops.rs index e88c317933..8ba87d9583 100644 --- a/vortex-array/src/arrays/bool/ops.rs +++ 
b/vortex-array/src/arrays/bool/ops.rs @@ -8,7 +8,7 @@ use crate::{ArrayRef, IntoArray}; impl OperationsVTable for BoolVTable { fn slice(array: &BoolArray, start: usize, stop: usize) -> VortexResult { Ok(BoolArray::new( - array.boolean_buffer().slice(start, stop - start), + array.bit_buffer().slice(start, stop - start), array.validity().slice(start, stop)?, ) .into_array()) @@ -16,7 +16,7 @@ impl OperationsVTable for BoolVTable { fn scalar_at(array: &BoolArray, index: usize) -> VortexResult { Ok(Scalar::bool( - array.boolean_buffer().value(index), + array.bit_buffer().value(index), array.dtype().nullability(), )) } @@ -32,8 +32,8 @@ mod tests { let arr = BoolArray::from_iter(std::iter::repeat_n(Some(true), 100)); let sliced_arr = arr.slice(8, 16).unwrap().to_bool().unwrap(); assert_eq!(sliced_arr.len(), 8); - assert_eq!(sliced_arr.boolean_buffer().len(), 8); - assert_eq!(sliced_arr.boolean_buffer().offset(), 0); + assert_eq!(sliced_arr.bit_buffer().len(), 8); + assert_eq!(sliced_arr.bit_buffer().offset(), 0); } #[test] diff --git a/vortex-array/src/arrays/bool/patch.rs b/vortex-array/src/arrays/bool/patch.rs index 0d2dfc2eab..ece5597b12 100644 --- a/vortex-array/src/arrays/bool/patch.rs +++ b/vortex-array/src/arrays/bool/patch.rs @@ -19,7 +19,7 @@ impl BoolArray { .clone() .patch(len, offset, indices.as_ref(), values.validity())?; - let (mut own_values, bit_offset) = self.into_boolean_builder(); + let (mut own_values, bit_offset) = self.into_bit_mut(); match_each_integer_ptype!(indices.ptype(), |$I| { for (idx, value) in indices .as_slice::<$I>() @@ -48,7 +48,7 @@ mod tests { fn patch_sliced_bools() { let arr = BoolArray::from(BooleanBuffer::new_set(12)); let sliced = arr.slice(4, 12).unwrap(); - let (values, offset) = sliced.to_bool().unwrap().into_boolean_builder(); + let (values, offset) = sliced.to_bool().unwrap().into_bit_mut(); assert_eq!(offset, 4); assert_eq!(values.len(), 12); assert_eq!(values.as_slice(), &[255, 15]); @@ -58,7 +58,7 @@ mod tests { fn 
patch_sliced_bools_offset() { let arr = BoolArray::from(BooleanBuffer::new_set(15)); let sliced = arr.slice(4, 15).unwrap(); - let (values, offset) = sliced.to_bool().unwrap().into_boolean_builder(); + let (values, offset) = sliced.to_bool().unwrap().into_bit_mut(); assert_eq!(offset, 4); assert_eq!(values.as_slice(), &[255, 127]); } diff --git a/vortex-array/src/arrays/bool/serde.rs b/vortex-array/src/arrays/bool/serde.rs index db385c6f4e..ddb4a2a1e9 100644 --- a/vortex-array/src/arrays/bool/serde.rs +++ b/vortex-array/src/arrays/bool/serde.rs @@ -21,7 +21,7 @@ impl SerdeVTable for BoolVTable { type Metadata = ProstMetadata; fn metadata(array: &BoolArray) -> VortexResult> { - let bit_offset = array.boolean_buffer().offset(); + let bit_offset = array.bit_buffer().offset(); assert!(bit_offset < 8, "Offset must be <8, got {}", bit_offset); Ok(Some(ProstMetadata(BoolMetadata { offset: u32::try_from(bit_offset).vortex_expect("checked"), @@ -61,7 +61,7 @@ impl SerdeVTable for BoolVTable { impl VisitorVTable for BoolVTable { fn visit_buffers(array: &BoolArray, visitor: &mut dyn ArrayBufferVisitor) { visitor.visit_buffer(&ByteBuffer::from_arrow_buffer( - array.boolean_buffer().clone().into_inner(), + array.bit_buffer().clone().into_inner(), Alignment::none(), )) } diff --git a/vortex-array/src/arrays/chunked/compute/elementwise.rs b/vortex-array/src/arrays/chunked/compute/elementwise.rs index a8fcb7626b..acdc43cf0d 100644 --- a/vortex-array/src/arrays/chunked/compute/elementwise.rs +++ b/vortex-array/src/arrays/chunked/compute/elementwise.rs @@ -99,7 +99,7 @@ mod tests { .to_bool() .unwrap(); assert_eq!( - result.boolean_buffer(), + result.bit_buffer(), &BooleanBuffer::from_iter([true, true, false, false, true]) ); } diff --git a/vortex-array/src/arrays/decimal/compute/between.rs b/vortex-array/src/arrays/decimal/compute/between.rs index afb8fd82fe..f12edc424c 100644 --- a/vortex-array/src/arrays/decimal/compute/between.rs +++ 
b/vortex-array/src/arrays/decimal/compute/between.rs @@ -167,7 +167,7 @@ mod tests { .unwrap() .into_bool() .unwrap() - .boolean_buffer() + .bit_buffer() .iter() .collect() } diff --git a/vortex-array/src/arrays/primitive/accessor.rs b/vortex-array/src/arrays/primitive/accessor.rs index 693f77d6f0..be02026bbf 100644 --- a/vortex-array/src/arrays/primitive/accessor.rs +++ b/vortex-array/src/arrays/primitive/accessor.rs @@ -25,7 +25,7 @@ impl ArrayAccessor for PrimitiveArray { let mut iter = self .as_slice::() .iter() - .zip(validity.boolean_buffer().iter()) + .zip(validity.bit_buffer().iter()) .map(|(value, valid)| valid.then_some(value)); Ok(f(&mut iter)) } diff --git a/vortex-array/src/arrays/primitive/compute/fill_null.rs b/vortex-array/src/arrays/primitive/compute/fill_null.rs index 0b88151e45..65ff4f87e1 100644 --- a/vortex-array/src/arrays/primitive/compute/fill_null.rs +++ b/vortex-array/src/arrays/primitive/compute/fill_null.rs @@ -30,7 +30,7 @@ impl FillNullKernel for PrimitiveVTable { } Validity::Array(is_valid) => { // TODO(danking): when we take PrimitiveArray by value, we should mutate in-place - let is_invalid = is_valid.to_bool()?.boolean_buffer().not(); + let is_invalid = is_valid.to_bool()?.bit_buffer().not(); match_each_native_ptype!(array.ptype(), |$T| { let mut buffer = BufferMut::copy_from(array.as_slice::<$T>()); let fill_value = fill_value diff --git a/vortex-array/src/arrays/primitive/mod.rs b/vortex-array/src/arrays/primitive/mod.rs index c1843b734f..88a6935c1a 100644 --- a/vortex-array/src/arrays/primitive/mod.rs +++ b/vortex-array/src/arrays/primitive/mod.rs @@ -223,7 +223,7 @@ impl PrimitiveArray { } Validity::Array(val) => { let val = val.to_canonical()?.into_bool()?; - BufferMut::::from_iter(buf_iter.zip(val.boolean_buffer()).map(f)) + BufferMut::::from_iter(buf_iter.zip(val.bit_buffer()).map(f)) } }; Ok(PrimitiveArray::new(buffer.freeze(), validity.clone())) diff --git a/vortex-array/src/arrays/struct_/mod.rs 
b/vortex-array/src/arrays/struct_/mod.rs index 54a5f09933..2a503b1809 100644 --- a/vortex-array/src/arrays/struct_/mod.rs +++ b/vortex-array/src/arrays/struct_/mod.rs @@ -299,7 +299,7 @@ mod test { assert_eq!( bools .as_::() - .boolean_buffer() + .bit_buffer() .iter() .collect::>(), vec![true, true, true, false, false] diff --git a/vortex-array/src/arrays/varbin/compute/compare.rs b/vortex-array/src/arrays/varbin/compute/compare.rs index 9ba184e1db..99e9d33d68 100644 --- a/vortex-array/src/arrays/varbin/compute/compare.rs +++ b/vortex-array/src/arrays/varbin/compute/compare.rs @@ -142,7 +142,7 @@ mod test { .unwrap(); assert_eq!( - result.boolean_buffer(), + result.bit_buffer(), &BooleanBuffer::from_iter([true, false, false]) ); } diff --git a/vortex-array/src/arrays/varbinview/accessor.rs b/vortex-array/src/arrays/varbinview/accessor.rs index 0f85f11d4f..3ab2399de6 100644 --- a/vortex-array/src/arrays/varbinview/accessor.rs +++ b/vortex-array/src/arrays/varbinview/accessor.rs @@ -38,7 +38,7 @@ impl ArrayAccessor<[u8]> for VarBinViewArray { let validity = v.to_bool()?; let mut iter = views .iter() - .zip(validity.boolean_buffer()) + .zip(validity.bit_buffer()) .map(|(view, valid)| { if valid { if view.is_inlined() { diff --git a/vortex-array/src/arrow/compute/to_arrow/canonical.rs b/vortex-array/src/arrow/compute/to_arrow/canonical.rs index cbbbf393b9..2b7e51e0eb 100644 --- a/vortex-array/src/arrow/compute/to_arrow/canonical.rs +++ b/vortex-array/src/arrow/compute/to_arrow/canonical.rs @@ -165,7 +165,7 @@ fn to_arrow_null(array: NullArray) -> VortexResult { fn to_arrow_bool(array: BoolArray) -> VortexResult { Ok(Arc::new(ArrowBoolArray::new( - array.boolean_buffer().clone(), + array.bit_buffer().clone(), array.validity_mask()?.to_null_buffer(), ))) } diff --git a/vortex-array/src/builders/bool.rs b/vortex-array/src/builders/bool.rs index 4d6e7c4b95..0329075780 100644 --- a/vortex-array/src/builders/bool.rs +++ b/vortex-array/src/builders/bool.rs @@ -80,7 +80,7 @@ 
impl ArrayBuilder for BoolBuilder { vortex_bail!("Expected Canonical::Bool, found {:?}", array); }; - self.inner.append_buffer(array.boolean_buffer()); + self.inner.append_buffer(array.bit_buffer()); self.nulls.append_validity_mask(array.validity_mask()?); Ok(()) @@ -159,6 +159,6 @@ mod tests { let into_canon = chunk.to_bool().unwrap(); assert_eq!(canon_into.validity(), into_canon.validity()); - assert_eq!(canon_into.boolean_buffer(), into_canon.boolean_buffer()); + assert_eq!(canon_into.bit_buffer(), into_canon.bit_buffer()); } } diff --git a/vortex-array/src/compute/compare.rs b/vortex-array/src/compute/compare.rs index b54c4d2ffb..ab9c6db268 100644 --- a/vortex-array/src/compute/compare.rs +++ b/vortex-array/src/compute/compare.rs @@ -333,7 +333,7 @@ mod tests { use crate::validity::Validity; fn to_int_indices(indices_bits: BoolArray) -> Vec { - let buffer = indices_bits.boolean_buffer(); + let buffer = indices_bits.bit_buffer(); let mask = indices_bits.validity_mask().unwrap(); buffer .iter() @@ -431,7 +431,7 @@ mod tests { fn arrow_compare_different_encodings(#[case] left: ArrayRef, #[case] right: ArrayRef) { let res = compare(&left, &right, Operator::Eq).unwrap(); assert_eq!( - res.to_bool().unwrap().boolean_buffer().count_set_bits(), + res.to_bool().unwrap().bit_buffer().count_set_bits(), left.len() ); } diff --git a/vortex-array/src/compute/filter.rs b/vortex-array/src/compute/filter.rs index 466a82a8b3..0fab53b164 100644 --- a/vortex-array/src/compute/filter.rs +++ b/vortex-array/src/compute/filter.rs @@ -204,9 +204,9 @@ impl TryFrom<&BoolArray> for Mask { // Extract a boolean buffer, treating null values to false let buffer = match array.validity_mask()? 
{ - Mask::AllTrue(_) => array.boolean_buffer().clone(), + Mask::AllTrue(_) => array.bit_buffer().clone(), Mask::AllFalse(_) => return Ok(Self::new_false(array.len())), - Mask::Values(validity) => validity.boolean_buffer().bitand(array.boolean_buffer()), + Mask::Values(validity) => validity.boolean_buffer().bitand(array.bit_buffer()), }; Ok(Self::from_buffer(buffer)) diff --git a/vortex-array/src/compute/list.rs b/vortex-array/src/compute/list.rs index 413c978921..37d54f8110 100644 --- a/vortex-array/src/compute/list.rs +++ b/vortex-array/src/compute/list.rs @@ -42,7 +42,7 @@ use crate::{Array, ArrayRef, IntoArray, ToCanonical}; /// let list_array = ListArray::try_new(elements, offsets, Validity::NonNullable).unwrap(); /// /// let matches = list_contains(list_array.as_ref(), "b".into()).unwrap(); -/// let to_vec: Vec = matches.to_bool().unwrap().boolean_buffer().iter().collect(); +/// let to_vec: Vec = matches.to_bool().unwrap().bit_buffer().iter().collect(); /// assert_eq!(to_vec, vec![false, true, false]); /// ``` pub fn list_contains(array: &dyn Array, value: Scalar) -> VortexResult { @@ -294,8 +294,8 @@ mod tests { let result = list_contains(&list_array, scalar).expect("list_contains failed"); let bool_result = result.to_bool().expect("to_bool failed"); assert_eq!( - bool_result.boolean_buffer().iter().collect_vec(), - expected.boolean_buffer().iter().collect_vec() + bool_result.bit_buffer().iter().collect_vec(), + expected.bit_buffer().iter().collect_vec() ); assert_eq!(bool_result.validity(), expected.validity()); } @@ -318,7 +318,7 @@ mod tests { contains .to_bool() .unwrap() - .boolean_buffer() + .bit_buffer() .iter() .collect_vec(), vec![true, true], diff --git a/vortex-array/src/validity.rs b/vortex-array/src/validity.rs index 7f0abfcd4b..469cd89563 100644 --- a/vortex-array/src/validity.rs +++ b/vortex-array/src/validity.rs @@ -90,7 +90,7 @@ impl Validity { Validity::AllInvalid => false, Validity::Array(array) => { // TODO(ngates): replace with SUM 
compute function - array.to_bool()?.boolean_buffer().count_set_bits() == array.len() + array.to_bool()?.bit_buffer().count_set_bits() == array.len() } }) } @@ -101,7 +101,7 @@ impl Validity { Validity::AllInvalid => true, Validity::Array(array) => { // TODO(ngates): replace with SUM compute function - array.to_bool()?.boolean_buffer().count_set_bits() == 0 + array.to_bool()?.bit_buffer().count_set_bits() == 0 } }) } @@ -191,7 +191,7 @@ impl Validity { Validity::Array(is_valid) => { let is_valid = is_valid.to_bool()?; let keep_valid = make_invalid.not(); - Validity::from(is_valid.boolean_buffer().bitand(&keep_valid)) + Validity::from(is_valid.bit_buffer().bitand(&keep_valid)) } }), } @@ -235,8 +235,8 @@ impl Validity { let lhs = lhs.to_bool()?; let rhs = rhs.to_bool()?; - let lhs = lhs.boolean_buffer(); - let rhs = rhs.boolean_buffer(); + let lhs = lhs.bit_buffer(); + let rhs = rhs.bit_buffer(); Validity::from(lhs.bitand(rhs)) } @@ -391,7 +391,7 @@ impl PartialEq for Validity { let b = b .to_bool() .vortex_expect("Failed to get Validity Array as BoolArray"); - a.boolean_buffer() == b.boolean_buffer() + a.bit_buffer() == b.bit_buffer() } _ => false, } diff --git a/vortex-buffer/src/bit/arrow.rs b/vortex-buffer/src/bit/arrow.rs new file mode 100644 index 0000000000..4eb584f7e9 --- /dev/null +++ b/vortex-buffer/src/bit/arrow.rs @@ -0,0 +1,59 @@ +//! Conversions between `BitBuffer` and Arrow's `BooleanBuffer`. 
+
+use arrow_buffer::BooleanBuffer;
+
+use crate::{Alignment, BitBuffer, ByteBuffer};
+
+/// Wrap an Arrow `BooleanBuffer` as a `BitBuffer`, keeping its bit offset and length.
+impl From<BooleanBuffer> for BitBuffer {
+    fn from(value: BooleanBuffer) -> Self {
+        let offset = value.offset();
+        let len = value.len();
+        let buffer: arrow_buffer::Buffer = value.into_inner();
+        let buffer = ByteBuffer::from_arrow_buffer(buffer, Alignment::of::<u8>());
+
+        BitBuffer::new_with_offset(buffer, len, offset)
+    }
+}
+
+/// Convert a `BitBuffer` into an Arrow `BooleanBuffer` covering the same bits.
+impl From<BitBuffer> for BooleanBuffer {
+    fn from(value: BitBuffer) -> Self {
+        // `into_inner` slices away the whole bytes in front of the first bit, so only the
+        // sub-byte part of the offset still applies to the buffer it returns.
+        let offset = value.offset() % 8;
+        let len = value.len();
+        let buffer = value.into_inner();
+
+        BooleanBuffer::new(buffer.into_arrow_buffer(), offset, len)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder};
+
+    use crate::BitBuffer;
+
+    #[test]
+    fn test_from_arrow() {
+        let mut arrow_bools = BooleanBufferBuilder::new(10);
+        arrow_bools.append_n(5, true);
+        arrow_bools.append_n(5, false);
+        let bit_buffer: BitBuffer = arrow_bools.finish().into();
+
+        for i in 0..5 {
+            assert!(bit_buffer.value(i));
+        }
+
+        for i in 5..10 {
+            assert!(!bit_buffer.value(i));
+        }
+
+        // Convert back to Arrow
+        let arrow_bools: BooleanBuffer = bit_buffer.into();
+
+        for i in 0..5 {
+            assert!(arrow_bools.value(i));
+        }
+        for i in 5..10 {
+            assert!(!arrow_bools.value(i));
+        }
+    }
+}
diff --git a/vortex-buffer/src/bit/bit_iter.rs b/vortex-buffer/src/bit/bit_iter.rs
new file mode 100644
index 0000000000..2547eb9d3f
--- /dev/null
+++ b/vortex-buffer/src/bit/bit_iter.rs
@@ -0,0 +1,345 @@
+use std::iter::TrustedLen;
+
+use vortex_error::VortexExpect;
+
+use crate::{Buffer, BufferIterator, ByteBuffer, get_bit_unchecked};
+
+/// Read up to the first 8 bytes of `input` as a little-endian `u64`, zero-filling the
+/// missing high bytes when `input` is shorter than 8 bytes.
+#[inline]
+fn read_u64(input: &[u8]) -> u64 {
+    let len = input.len().min(8);
+    let mut buf = [0u8; 8];
+    // Copy only `len` bytes so that a longer input cannot trip the length check.
+    buf[..len].copy_from_slice(&input[..len]);
+    u64::from_le_bytes(buf)
+}
+
+/// Mask with the low `lead_padding` bits cleared and every other bit set.
+#[inline]
+fn compute_prefix_mask(lead_padding: usize) -> u64 {
+    !((1 << lead_padding) - 1)
+}
+
+/// Returns `(suffix_mask, trailing_padding)` for a range of `len` bits that starts
+/// `lead_padding` bits into its first 64-bit word.
+#[inline]
+fn compute_suffix_mask(len: usize, lead_padding: usize) -> (u64, usize) {
+    let
trailing_bits = (len + lead_padding) % 64;
+
+    // The range ends exactly on a word boundary: keep the whole last word.
+    if trailing_bits == 0 {
+        return (u64::MAX, 0);
+    }
+
+    let trailing_padding = 64 - trailing_bits;
+    let suffix_mask = (1 << trailing_bits) - 1;
+    (suffix_mask, trailing_padding)
+}
+
+/// A bit range split into 64-bit words: an optional masked `prefix` word, a run of whole
+/// `chunks`, and an optional masked `suffix` word.
+///
+/// Bits outside the requested range are zeroed in `prefix` and `suffix`, so the words can be
+/// popcounted or scanned directly.
+pub struct UnalignedBitChunks {
+    /// Number of zeroed bits in front of the first in-range bit of the first word.
+    lead_padding: usize,
+    /// Number of zeroed bits after the last in-range bit of the last word.
+    trailing_padding: usize,
+    prefix: Option<u64>,
+    chunks: Buffer<u64>,
+    suffix: Option<u64>,
+}
+
+impl UnalignedBitChunks {
+    /// Build the chunked view of `len` bits of `buffer`, starting at bit `offset`.
+    ///
+    /// `offset` may be any bit offset: whole bytes are skipped through `byte_offset` and the
+    /// remaining `offset % 8` bits are masked out of the first word.
+    pub fn new(buffer: ByteBuffer, offset: usize, len: usize) -> Self {
+        if len == 0 {
+            return Self {
+                lead_padding: 0,
+                trailing_padding: 0,
+                prefix: None,
+                chunks: Buffer::empty(),
+                suffix: None,
+            };
+        }
+        let byte_offset = offset / 8;
+        let offset_padding = offset % 8;
+        let bytes_len = (len + offset_padding).div_ceil(8);
+
+        let buffer = buffer.slice(byte_offset..byte_offset + bytes_len);
+
+        let prefix_mask = compute_prefix_mask(offset_padding);
+
+        // If less than 8 bytes, read into prefix
+        if buffer.len() <= 8 {
+            let (suffix_mask, trailing_padding) = compute_suffix_mask(len, offset_padding);
+            let prefix = read_u64(&buffer) & suffix_mask & prefix_mask;
+
+            return Self {
+                lead_padding: offset_padding,
+                trailing_padding,
+                prefix: Some(prefix),
+                chunks: Buffer::empty(),
+                suffix: None,
+            };
+        }
+
+        // If less than 16 bytes, read into prefix and suffix
+        if buffer.len() <= 16 {
+            let (suffix_mask, trailing_padding) = compute_suffix_mask(len, offset_padding);
+            let prefix = read_u64(&buffer[..8]) & prefix_mask;
+            let suffix = read_u64(&buffer[8..]) & suffix_mask;
+
+            return Self {
+                lead_padding: offset_padding,
+                trailing_padding,
+                prefix: Some(prefix),
+                chunks: Buffer::empty(),
+                suffix: Some(suffix),
+            };
+        }
+
+        // Larger buffers: split into an unaligned head, aligned u64 words, and an unaligned tail.
+        let (prefix, mut chunks, suffix) = buffer.align_to::<u64>();
+        assert!(
+            prefix.len() < 8 && suffix.len() < 8,
+            "align_to did not return largest possible aligned slice"
+        );
+        let (alignment_padding, prefix) = match (offset_padding, prefix.is_empty()) {
+            (0, true) => (0, None),
+            (_, true) => {
+                // No unaligned head, but the first aligned word still starts with padding bits.
+                let prefix = chunks[0] & prefix_mask;
+                chunks = chunks.slice(1..);
+                (0, Some(prefix))
+            }
+            (_, false) => {
+                // Shift the short head up so that its last byte is the top byte of the word.
+                let alignment_padding = (8 - prefix.len()) * 8;
+
+                let prefix = (read_u64(&prefix) & prefix_mask) << alignment_padding;
+                (alignment_padding, Some(prefix))
+            }
+        };
+
+        let lead_padding = offset_padding + alignment_padding;
+        let (suffix_mask, trailing_padding) = compute_suffix_mask(len, lead_padding);
+
+        let suffix = match (trailing_padding, suffix.is_empty()) {
+            (0, _) => None,
+            (_, true) => {
+                // No unaligned tail, but the last aligned word ends with padding bits.
+                let suffix = chunks[chunks.len() - 1] & suffix_mask;
+                chunks = chunks.slice(..chunks.len() - 1);
+                Some(suffix)
+            }
+            (_, false) => Some(read_u64(&suffix) & suffix_mask),
+        };
+
+        Self {
+            lead_padding,
+            trailing_padding,
+            prefix,
+            chunks,
+            suffix,
+        }
+    }
+
+    /// Iterate over the prefix word, the whole words, then the suffix word.
+    pub fn iter(&self) -> UnalignedBitChunkIterator {
+        self.prefix
+            .into_iter()
+            .chain(self.chunks.clone())
+            .chain(self.suffix)
+    }
+
+    /// Number of padding bits in front of the first in-range bit.
+    pub fn lead_padding(&self) -> usize {
+        self.lead_padding
+    }
+
+    /// Number of padding bits after the last in-range bit.
+    pub fn trailing_padding(&self) -> usize {
+        self.trailing_padding
+    }
+
+    /// Number of set bits in the range; padding bits are masked to zero and never counted.
+    pub fn count_ones(&self) -> usize {
+        self.iter().map(|x| x.count_ones() as usize).sum()
+    }
+}
+
+pub type UnalignedBitChunkIterator = core::iter::Chain<
+    core::iter::Chain<core::option::IntoIter<u64>, BufferIterator<u64>>,
+    core::option::IntoIter<u64>,
+>;
+
+/// Iterator over bits in the byte buffer
+pub struct BitIterator {
+    buffer: ByteBuffer,
+    current_offset: usize,
+    end_offset: usize,
+}
+
+impl BitIterator {
+    /// Iterate over `len` bits of `buffer`, starting at bit `offset`.
+    ///
+    /// Panics if `buffer` holds fewer than `offset + len` bits.
+    pub fn new(buffer: ByteBuffer, offset: usize, len: usize) -> Self {
+        let end_offset = offset + len;
+        assert!(
+            buffer.len() >= end_offset.div_ceil(8),
+            "Buffer {} too small for requested offset and len {}",
+            buffer.len(),
+            end_offset.div_ceil(8)
+        );
+
+        Self {
+            buffer,
+            current_offset: offset,
+            end_offset,
+        }
+    }
+}
+
+impl Iterator for BitIterator {
+    type Item = bool;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.current_offset == self.end_offset {
+            return None;
+        }
+        // SAFETY: current_offset is in
bounds
+        let v = unsafe { get_bit_unchecked(&self.buffer, self.current_offset) };
+        self.current_offset += 1;
+        Some(v)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let remaining_bits = self.end_offset - self.current_offset;
+        (remaining_bits, Some(remaining_bits))
+    }
+}
+
+unsafe impl TrustedLen for BitIterator {}
+
+impl ExactSizeIterator for BitIterator {}
+
+impl DoubleEndedIterator for BitIterator {
+    fn next_back(&mut self) -> Option<Self::Item> {
+        if self.current_offset == self.end_offset {
+            return None;
+        }
+        self.end_offset -= 1;
+        // Safety: end_offset is in bounds
+        Some(unsafe { get_bit_unchecked(&self.buffer, self.end_offset) })
+    }
+}
+
+/// Iterator over the indices of the set bits of a bit range.
+///
+/// Indices are relative to the start of the range, not to the backing buffer.
+pub struct BitIndexIterator {
+    /// Word being scanned; bits already yielded are cleared.
+    current_chunk: u64,
+    /// Range-relative index of bit 0 of `current_chunk` (negative while inside lead padding).
+    chunk_offset: i64,
+    iter: UnalignedBitChunkIterator,
+}
+
+impl BitIndexIterator {
+    /// Iterate over the set bits among `len` bits of `buffer`, starting at bit `offset`.
+    pub fn new(buffer: ByteBuffer, offset: usize, len: usize) -> Self {
+        let chunks = UnalignedBitChunks::new(buffer, offset, len);
+        let mut iter = chunks.iter();
+        let current_chunk = iter.next().unwrap_or(0);
+        let chunk_offset = -(chunks.lead_padding() as i64);
+
+        Self {
+            current_chunk,
+            chunk_offset,
+            iter,
+        }
+    }
+}
+
+impl Iterator for BitIndexIterator {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if self.current_chunk != 0 {
+                let bit_pos = self.current_chunk.trailing_zeros();
+                // Clear the bit that is about to be yielded.
+                self.current_chunk ^= 1 << bit_pos;
+                return Some(
+                    usize::try_from(self.chunk_offset + bit_pos as i64)
+                        .vortex_expect("bit index must be a usize"),
+                );
+            }
+
+            self.current_chunk = self.iter.next()?;
+            self.chunk_offset += 64;
+        }
+    }
+}
+
+/// Iterator over the runs of consecutive set bits of a bit range, yielded as `(start, end)`
+/// pairs where `end` is the index just past the last set bit of the run.
+pub struct BitSliceIterator {
+    iter: UnalignedBitChunkIterator,
+    /// Length of the range in bits; reset to 0 once a run reaching the end has been yielded.
+    len: usize,
+    current_offset: i64,
+    current_chunk: u64,
+}
+
+impl BitSliceIterator {
+    /// Iterate over the set runs among `len` bits of `buffer`, starting at bit `offset`.
+    pub fn new(buffer: ByteBuffer, offset: usize, len: usize) -> Self {
+        let chunks = UnalignedBitChunks::new(buffer, offset, len);
+        let mut iter = chunks.iter();
+        let current_chunk = iter.next().unwrap_or(0);
+        let current_offset = -(chunks.lead_padding() as i64);
+
+        Self {
+            iter,
+            len,
+            current_offset,
+            current_chunk,
+        }
+    }
+
+    /// Returns `Some((chunk_offset, bit_offset))` for the next chunk that has at
+    /// least one bit set, or None if there is no such chunk.
+    ///
+    /// Where `chunk_offset` is the bit offset to the current `u64` chunk
+    /// and `bit_offset` is the offset of the first `1` bit in that chunk
+    fn advance_to_set_bit(&mut self) -> Option<(i64, u32)> {
+        loop {
+            if self.current_chunk != 0 {
+                // Find the index of the first 1
+                let bit_pos = self.current_chunk.trailing_zeros();
+                return Some((self.current_offset, bit_pos));
+            }
+
+            self.current_chunk = self.iter.next()?;
+            self.current_offset += 64;
+        }
+    }
+}
+
+impl Iterator for BitSliceIterator {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.len == 0 {
+            return None;
+        }
+
+        let (start_chunk, start_bit) = self.advance_to_set_bit()?;
+
+        // Set bits up to start
+        self.current_chunk |= (1 << start_bit) - 1;
+
+        loop {
+            if self.current_chunk != u64::MAX {
+                // Find the index of the first 0
+                let end_bit = self.current_chunk.trailing_ones();
+
+                // Zero out up to end_bit
+                self.current_chunk &= !((1 << end_bit) - 1);
+
+                return Some((
+                    usize::try_from(start_chunk + start_bit as i64)
+                        .vortex_expect("bit offset must be a usize"),
+                    usize::try_from(self.current_offset + end_bit as i64)
+                        .vortex_expect("bit offset must be a usize"),
+                ));
+            }
+
+            match self.iter.next() {
+                Some(next) => {
+                    self.current_chunk = next;
+                    self.current_offset += 64;
+                }
+                None => {
+                    // The run extends to the end of the range; taking `len` also makes every
+                    // later call return `None`.
+                    return Some((
+                        usize::try_from(start_chunk + start_bit as i64)
+                            .vortex_expect("bit offset must be a usize"),
+                        std::mem::take(&mut self.len),
+                    ));
+                }
+            }
+        }
+    }
+}
diff --git a/vortex-buffer/src/bit/buf.rs b/vortex-buffer/src/bit/buf.rs
new file mode 100644
index 0000000000..4a8eecb1d7
--- /dev/null
+++ b/vortex-buffer/src/bit/buf.rs
@@ -0,0 +1,217 @@
+use crate::bit::bit_iter::{BitIndexIterator, BitIterator, BitSliceIterator, UnalignedBitChunks};
+use crate::{Buffer,
ByteBuffer, buffer, get_bit};
+
+/// An immutable bitset stored as a packed byte buffer.
+///
+/// Bit `i` of the view is bit `offset + i` of the backing bytes, least significant bit of each
+/// byte first.
+#[derive(Clone, Debug)]
+pub struct BitBuffer {
+    buffer: ByteBuffer,
+    /// Number of bits in view.
+    len: usize,
+    /// Position, in bits, of the first bit in view within `buffer`.
+    offset: usize,
+}
+
+impl BitBuffer {
+    /// Create a new `BitBuffer` backed by a [`ByteBuffer`] with `len` bits in view.
+    ///
+    /// Panics if the buffer is not large enough to hold `len` bits.
+    pub fn new(buffer: ByteBuffer, len: usize) -> Self {
+        assert!(
+            buffer.len() * 8 >= len,
+            "provided ByteBuffer not large enough to back BoolBuffer with len {len}"
+        );
+
+        Self {
+            buffer,
+            len,
+            offset: 0,
+        }
+    }
+
+    /// Create a new `BitBuffer` backed by a [`ByteBuffer`] with `len` bits in view, starting at the
+    /// given `offset` (in bits).
+    ///
+    /// Panics if the buffer is not large enough to hold `offset + len` bits.
+    pub fn new_with_offset(buffer: ByteBuffer, len: usize, offset: usize) -> Self {
+        // The view covers bits `offset..offset + len`. The offset may exceed the length (for
+        // example a short slice taken deep inside a buffer), so only the end has to fit.
+        assert!(
+            len.saturating_add(offset) <= buffer.len().saturating_mul(8),
+            "provided ByteBuffer not large enough to back BoolBuffer with offset {offset} len {len}"
+        );
+
+        Self {
+            buffer,
+            len,
+            offset,
+        }
+    }
+
+    /// Create a new `BitBuffer` of length `len` where all bits are set (true).
+    pub fn new_set(len: usize) -> Self {
+        let words = len.div_ceil(8);
+        let buffer = buffer![0xFF; words];
+
+        Self {
+            buffer,
+            len,
+            offset: 0,
+        }
+    }
+
+    /// Create a new `BitBuffer` of length `len` where all bits are unset (false).
+    pub fn new_unset(len: usize) -> Self {
+        let words = len.div_ceil(8);
+        let buffer = Buffer::zeroed(words);
+
+        Self {
+            buffer,
+            len,
+            offset: 0,
+        }
+    }
+
+    /// Get the logical length of this `BitBuffer`.
+    ///
+    /// This may differ from the physical length of the backing buffer, for example if it was
+    /// created using the `new_with_offset` constructor, or if it was sliced.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns `true` if the `BitBuffer` is empty.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Offset of the start of the buffer in bits.
+    #[inline]
+    pub fn offset(&self) -> usize {
+        self.offset
+    }
+
+    /// Retrieve the value at the given index.
+    ///
+    /// Panics if the index is out of bounds.
+    pub fn value(&self, index: usize) -> bool {
+        // Check against the logical length: the backing bytes may extend past the view.
+        assert!(index < self.len, "index {index} exceeds len {}", self.len);
+        get_bit(&self.buffer, index + self.offset)
+    }
+
+    /// Create a new zero-copy slice of this `BitBuffer` that begins at the `start` index and extends
+    /// for `len` bits.
+    ///
+    /// Panics if the slice would extend beyond the end of the buffer.
+    pub fn slice(&self, start: usize, len: usize) -> Self {
+        assert!(
+            start + len <= self.len,
+            "slice of len {len} starting at {start} exceeds len {}",
+            self.len
+        );
+
+        Self::new_with_offset(self.buffer.clone(), len, self.offset + start)
+    }
+
+    /// Slice any full bytes from the buffer, leaving the offset < 8.
+    pub fn shrink_offset(self) -> Self {
+        let bit_offset = self.offset % 8;
+        let len = self.len();
+        // `into_inner` already trims the backing bytes to the ones that hold the view, so the
+        // first bit in view sits `bit_offset` bits into the returned buffer.
+        let buffer = self.into_inner();
+        BitBuffer::new_with_offset(buffer, len, bit_offset)
+    }
+
+    /// Access chunks of the buffer aligned to 8 byte boundary as [prefix, chunks, suffix]
+    pub fn chunks(&self) -> UnalignedBitChunks {
+        UnalignedBitChunks::new(self.buffer.clone(), self.offset, self.len)
+    }
+
+    /// Get the number of set bits in the buffer.
+    pub fn true_count(&self) -> usize {
+        self.chunks().count_ones()
+    }
+
+    /// Get the number of unset bits in the buffer.
+    pub fn false_count(&self) -> usize {
+        self.len - self.true_count()
+    }
+
+    /// Iterator over bits in the buffer
+    pub fn iter(&self) -> BitIterator {
+        BitIterator::new(self.buffer.clone(), self.offset, self.len)
+    }
+
+    /// Iterator over set indices of the underlying buffer
+    pub fn set_indices(&self) -> BitIndexIterator {
+        BitIndexIterator::new(self.buffer.clone(), self.offset, self.len)
+    }
+
+    /// Iterator over set slices of the underlying buffer
+    pub fn set_slices(&self) -> BitSliceIterator {
+        BitSliceIterator::new(self.buffer.clone(), self.offset, self.len)
+    }
+}
+
+// Conversions
+
+impl BitBuffer {
+    /// Consumes this `BitBuffer` and returns the backing `Buffer` with any offset
+    /// and length information applied.
+    ///
+    /// Only whole bytes are trimmed: the result spans bytes `offset / 8` up to
+    /// `ceil((offset + len) / 8)`, so the first bit in view is `offset % 8` bits into it.
+    pub fn into_inner(self) -> ByteBuffer {
+        let word_start = self.offset / 8;
+        let word_end = (self.offset + self.len).div_ceil(8);
+
+        self.buffer.slice(word_start..word_end)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::bit::BitBuffer;
+    use crate::{ByteBuffer, buffer};
+
+    #[test]
+    fn test_bool() {
+        // Create a new Buffer of 1024 bytes where the 8th (most significant) bit of each byte is set.
+        let buffer: ByteBuffer = buffer![1 << 7; 1024];
+        let bools = BitBuffer::new(buffer, 1024 * 8);
+
+        // sanity checks
+        assert_eq!(bools.len(), 1024 * 8);
+        assert!(!bools.is_empty());
+        assert_eq!(bools.true_count(), 1024);
+        assert_eq!(bools.false_count(), 1024 * 7);
+
+        // Check all of the values
+        for word in 0..1024 {
+            for bit in 0..8 {
+                if bit == 7 {
+                    assert!(bools.value(word * 8 + bit));
+                } else {
+                    assert!(!bools.value(word * 8 + bit));
+                }
+            }
+        }
+
+        // Slice the buffer to create a new subset view.
+        let sliced = bools.slice(64, 8);
+
+        // sanity checks
+        assert_eq!(sliced.len(), 8);
+        assert!(!sliced.is_empty());
+        assert_eq!(sliced.true_count(), 1);
+        assert_eq!(sliced.false_count(), 7);
+
+        // Check all of the values like before
+        for bit in 0..8 {
+            if bit == 7 {
+                assert!(sliced.value(bit));
+            } else {
+                assert!(!sliced.value(bit));
+            }
+        }
+    }
+}
diff --git a/vortex-buffer/src/bit/buf_mut.rs b/vortex-buffer/src/bit/buf_mut.rs
new file mode 100644
index 0000000000..b1f4e3916a
--- /dev/null
+++ b/vortex-buffer/src/bit/buf_mut.rs
@@ -0,0 +1,287 @@
+use vortex_error::VortexExpect;
+
+use crate::{BitBuffer, BufferMut, ByteBufferMut, buffer_mut};
+
+/// A mutable bitset buffer that allows random access to individual bits for set and get.
+///
+///
+/// # Example
+/// ```
+/// use vortex_buffer::BitBufferMut;
+///
+/// let mut bools = BitBufferMut::new_unset(10);
+/// bools.set_to(9, true);
+/// for i in 0..9 {
+///     assert!(!bools.value(i));
+/// }
+/// assert!(bools.value(9));
+///
+/// // Freeze into a new bools vector.
+/// let bools = bools.freeze();
+/// ```
+///
+/// See also: [`crate::BitBuffer`].
+pub struct BitBufferMut {
+    buffer: ByteBufferMut,
+    // Number of bits currently populated.
+    len: usize,
+    // Maximum number of bits the append methods will accept.
+    capacity: usize,
+}
+
+impl BitBufferMut {
+    /// Create a new empty mutable bit buffer with requested capacity (in bits).
+    pub fn new(capacity: usize) -> Self {
+        Self {
+            buffer: BufferMut::with_capacity(capacity.div_ceil(8)),
+            len: 0,
+            capacity,
+        }
+    }
+
+    /// Create a new mutable buffer with requested `len` and all bits set to `true`.
+    ///
+    /// The capacity equals `len`, so nothing can be appended afterwards.
+    pub fn new_set(len: usize) -> Self {
+        Self {
+            buffer: buffer_mut![0xFF; len.div_ceil(8)],
+            capacity: len,
+            len,
+        }
+    }
+
+    /// Create a new mutable buffer with requested `len` and all bits set to `false`.
+    ///
+    /// The capacity equals `len`, so nothing can be appended afterwards.
+    pub fn new_unset(len: usize) -> Self {
+        Self {
+            buffer: BufferMut::zeroed(len.div_ceil(8)),
+            capacity: len,
+            len,
+        }
+    }
+
+    /// Get the current populated length of the buffer.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// True if the buffer has length 0.
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    /// Get the value at the requested index.
+    ///
+    /// Panics if `index` is not less than the buffer length.
+    pub fn value(&self, index: usize) -> bool {
+        assert!(index < self.len, "index {index} exceeds len {}", self.len);
+
+        let word = self.buffer[index / 8];
+        let bit = word & (1 << (index % 8));
+
+        bit != 0
+    }
+
+    /// Set the bit at `index` to the given boolean value.
+    ///
+    /// This operation is checked so if `index` exceeds the buffer length, this will panic.
+    pub fn set_to(&mut self, index: usize, value: bool) {
+        if value {
+            self.set(index);
+        } else {
+            self.unset(index);
+        }
+    }
+
+    /// Set a position to `true`.
+    ///
+    /// This operation is checked so if `index` exceeds the buffer length, this will panic.
+    pub fn set(&mut self, index: usize) {
+        assert!(index < self.len, "index {index} exceeds len {}", self.len);
+
+        // SAFETY: checked by assertion
+        unsafe { self.set_unchecked(index) };
+    }
+
+    /// Set a position to `false`.
+    ///
+    /// This operation is checked so if `index` exceeds the buffer length, this will panic.
+    pub fn unset(&mut self, index: usize) {
+        assert!(index < self.len, "index {index} exceeds len {}", self.len);
+
+        // SAFETY: checked by assertion
+        unsafe { self.unset_unchecked(index) };
+    }
+
+    /// Set the bit at `index` to `true` without checking bounds.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that `index` does not exceed the largest bit index in the backing buffer.
+    pub unsafe fn set_unchecked(&mut self, index: usize) {
+        let word_index = index / 8;
+        let bit_index = index % 8;
+        // SAFETY: checked by caller
+        unsafe {
+            let word = self.buffer.as_mut_ptr().add(word_index);
+            // `<<` binds tighter than `|`: this ORs the single-bit mask into the byte.
+            word.write(*word | 1 << bit_index);
+        }
+    }
+
+    /// Unset the bit at `index` without checking bounds.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that `index` does not exceed the largest bit index in the backing buffer.
+    pub unsafe fn unset_unchecked(&mut self, index: usize) {
+        let word_index = index / 8;
+        let bit_index = index % 8;
+
+        // SAFETY: checked by caller
+        unsafe {
+            let word = self.buffer.as_mut_ptr().add(word_index);
+            word.write(*word & !(1 << bit_index));
+        }
+    }
+
+    /// Append a new boolean into the bit buffer, incrementing the length.
+    ///
+    /// Panics if the buffer is full.
+    pub fn append(&mut self, value: bool) {
+        if value {
+            self.append_true()
+        } else {
+            self.append_false()
+        }
+    }
+
+    /// Append a new true value to the buffer.
+    ///
+    /// Panics if there is no remaining capacity.
+    pub fn append_true(&mut self) {
+        assert!(
+            self.len < self.capacity,
+            "cannot append to full BitBufferMut"
+        );
+
+        if self.len % 8 == 0 {
+            // Push a new word that starts with 1
+            self.buffer.push(1u8);
+        } else {
+            // Push a 1 bit into the current word.
+            let word = self.buffer.last_mut().vortex_expect("buffer is not empty");
+            *word |= 1 << (self.len % 8);
+        }
+
+        self.len += 1;
+    }
+
+    /// Append a new false value to the buffer.
+    ///
+    /// Panics if there is no remaining capacity.
+    pub fn append_false(&mut self) {
+        assert!(
+            self.len < self.capacity,
+            "cannot append to full BitBufferMut"
+        );
+
+        if self.len % 8 == 0 {
+            // push new word that starts with 0
+            self.buffer.push(0u8);
+        }
+        // Otherwise nothing is written: this relies on the bits past `len` in the last byte
+        // already being zero.
+
+        self.len += 1;
+    }
+    /// Append several boolean values into the bit buffer. After this operation,
+    /// the length will be incremented by `n`.
+    ///
+    /// Panics if the buffer does not have `n` slots left.
+    pub fn append_n(&mut self, value: bool, n: usize) {
+        // Implementation is largely borrowed from arrow::BooleanBufferBuilder::append_n
+        assert!(
+            self.len.saturating_add(n) <= self.capacity,
+            "cannot append {n} entries to BitBufferMut with len {} capacity {}",
+            self.len,
+            self.capacity
+        );
+
+        match value {
+            true => {
+                let new_len = self.len + n;
+                let new_len_bytes = new_len.div_ceil(8);
+                let cur_remainder = self.len % 8;
+                let new_remainder = new_len % 8;
+
+                if cur_remainder != 0 {
+                    // Fill the unused high bits of the current last byte with 1s, keeping the
+                    // `cur_remainder` low bits that are already populated.
+                    *self
+                        .buffer
+                        .as_mut_slice()
+                        .last_mut()
+                        .vortex_expect("buffer is not empty") |= !((1 << cur_remainder) - 1);
+                }
+
+                // Push several full bytes.
+                if new_len_bytes > self.buffer.len() {
+                    // Every new byte, including the final one, is pushed as all ones; the
+                    // final byte is trimmed just below.
+                    self.buffer.push_n(0xFF, new_len_bytes - self.buffer.len());
+                }
+
+                // Patch zeros into remainder of last byte pushed
+                if new_remainder > 0 {
+                    // Keep only the `new_remainder` low bits, clearing everything past the
+                    // new length.
+                    *self
+                        .buffer
+                        .as_mut_slice()
+                        .last_mut()
+                        .vortex_expect("buffer is not empty") &= (1 << new_remainder) - 1;
+                }
+            }
+            false => {
+                let new_len = self.len + n;
+                let new_len_bytes = new_len.div_ceil(8);
+
+                // push new 0 bytes.
+                if new_len_bytes > self.buffer.len() {
+                    self.buffer.push_n(0, new_len_bytes - self.buffer.len());
+                }
+            }
+        }
+
+        self.len += n;
+    }
+
+    /// Freeze the buffer in its current state into an immutable `BitBuffer`.
+    pub fn freeze(self) -> BitBuffer {
+        BitBuffer::new(self.buffer.freeze().into_byte_buffer(), self.len)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::bit::buf_mut::BitBufferMut;
+
+    #[test]
+    fn test_bits_mut() {
+        let mut bools = BitBufferMut::new_unset(10);
+        bools.set_to(0, true);
+        bools.set_to(9, true);
+
+        let bools = bools.freeze();
+        assert!(bools.value(0));
+        for i in 1..=8 {
+            assert!(!bools.value(i));
+        }
+        assert!(bools.value(9));
+    }
+
+    #[test]
+    fn test_append_n() {
+        let mut bools = BitBufferMut::new(10);
+        assert_eq!(bools.len(), 0);
+        assert!(bools.is_empty());
+
+        // One true, eight falses (crossing the first byte boundary), one true.
+        bools.append(true);
+        bools.append_n(false, 8);
+        bools.append_n(true, 1);
+
+        let bools = bools.freeze();
+
+        assert_eq!(bools.true_count(), 2);
+        assert!(bools.value(0));
+        assert!(bools.value(9));
+    }
+}
diff --git a/vortex-buffer/src/bit/mod.rs b/vortex-buffer/src/bit/mod.rs
new file mode 100644
index 0000000000..42f254614b
--- /dev/null
+++ b/vortex-buffer/src/bit/mod.rs
@@ -0,0 +1,29 @@
+//! Packed bitmaps that can be used to store boolean values.
+//!
+//! This module provides a wrapper on top of the `Buffer` type to store mutable and immutable
+//! bitsets. The bitsets are stored in little-endian order, meaning that the least significant bit
+//! of the first byte is the first bit in the bitset.
+#[cfg(feature = "arrow")]
+mod arrow;
+mod bit_iter;
+mod buf;
+mod buf_mut;
+
+pub use buf::*;
+pub use buf_mut::*;
+
+/// Get bit value at `index` out of `buf`
+///
+/// Panics if `index / 8` is not a valid index into `buf`.
+#[inline]
+pub fn get_bit(buf: &[u8], index: usize) -> bool {
+    buf[index / 8] & (1 << (index % 8)) != 0
+}
+
+/// Get bit value at `index` out of `buf` without bounds checking
+///
+/// # Safety
+/// `index` is a bit index: `index / 8` must be less than `buf.len()`.
+#[inline]
+pub unsafe fn get_bit_unchecked(buf: &[u8], index: usize) -> bool {
+    // SAFETY: the caller guarantees that `index / 8` is in bounds.
+    let byte = unsafe { buf.get_unchecked(index / 8) };
+    byte & (1 << (index % 8)) != 0
+}
diff --git a/vortex-buffer/src/buffer.rs b/vortex-buffer/src/buffer.rs
index a6a0a2172f..e457444fb3 100644
--- a/vortex-buffer/src/buffer.rs
+++ b/vortex-buffer/src/buffer.rs
@@ -174,7 +174,7 @@ impl<T> Buffer<T> {
     }
 
     /// Returns an iterator over the buffer of elements of type T.
-    pub fn iter(&self) -> impl Iterator<Item = &T> + '_ {
+    pub fn iter(&self) -> std::slice::Iter<'_, T> {
         self.as_slice().iter()
     }
 
@@ -395,6 +395,65 @@ impl<T> Buffer<T> {
             vortex_panic!("Buffer is not aligned to requested alignment {}", alignment)
         }
     }
+
+    /// Split the buffer into an unaligned head, a middle section aligned for `U`, and an
+    /// unaligned tail, in the manner of the standard library's `slice::align_to`.
+    ///
+    /// When no position inside the buffer is aligned for `U`, the whole buffer is returned as
+    /// the head and the other two parts are empty.
+    pub fn align_to<U>(mut self) -> (Buffer<T>, Buffer<U>, Buffer<T>) {
+        let offset = self.as_ptr().align_offset(align_of::<U>());
+        if offset > self.len() {
+            (
+                self,
+                Buffer::empty_aligned(Alignment::of::<U>()),
+                Buffer::empty_aligned(Alignment::of::<T>()),
+            )
+        } else {
+            // NOTE(review): `offset` is used here as a byte count, and `align_to_offsets` reads
+            // `self.len()` after the head has been split off. Both are only right if `as_ptr`
+            // yields a byte pointer (or `T` is one byte wide) and `len()` follows `self.bytes`
+            // -- confirm against the `Buffer` definition.
+            let left = self.bytes.split_to(offset);
+            let (us_len, _) = self.align_to_offsets::<U>();
+            // `us_len` is a count of `U` elements; the split position on the raw bytes has to
+            // be scaled to bytes.
+            let trailer = self.bytes.split_off(us_len * size_of::<U>());
+            (
+                Buffer::from_bytes_aligned(left, Alignment::of::<T>()),
+                Buffer::from_bytes_aligned(self.bytes, Alignment::of::<U>()),
+                Buffer::from_bytes_aligned(trailer, Alignment::of::<T>()),
+            )
+        }
+    }
+
+    /// Adapted from standard library slice::align_to_offsets
+    /// Function to calculate lengths of the middle and trailing slice for `align_to`.
+    ///
+    /// Returns `(us_len, ts_len)`: the number of `U` elements in the middle section and the
+    /// number of `T` elements left over for the tail.
+    fn align_to_offsets<U>(&self) -> (usize, usize) {
+        // What we gonna do about `rest` is figure out what multiple of `U`s we can put in a
+        // lowest number of `T`s. And how many `T`s we need for each such "multiple".
+        //
+        // Consider for example T=u8 U=u16. Then we can put 1 U in 2 Ts. Simple. Now, consider
+        // for example a case where size_of::<T> = 16, size_of::<U> = 24. We can put 2 Us in
+        // place of every 3 Ts in the `rest` slice. A bit more complicated.
+        //
+        // Formula to calculate this is:
+        //
+        // Us = lcm(size_of::<T>, size_of::<U>) / size_of::<U>
+        // Ts = lcm(size_of::<T>, size_of::<U>) / size_of::<T>
+        //
+        // Expanded and simplified:
+        //
+        // Us = size_of::<T> / gcd(size_of::<T>, size_of::<U>)
+        // Ts = size_of::<U> / gcd(size_of::<T>, size_of::<U>)
+        //
+        // Luckily since all this is constant-evaluated... performance here matters not!
+        const fn gcd(a: usize, b: usize) -> usize {
+            if b == 0 { a } else { gcd(b, a % b) }
+        }
+
+        // Explicitly wrap the function call in a const block so it gets
+        // constant-evaluated even in debug mode.
+        let gcd: usize = const { gcd(size_of::<T>(), size_of::<U>()) };
+        let ts: usize = size_of::<U>() / gcd;
+        let us: usize = size_of::<T>() / gcd;
+
+        // Armed with this knowledge, we can find how many `U`s we can fit!
+        let us_len = self.len() / ts * us;
+        // And how many `T`s will be in the trailing slice!
+        let ts_len = self.len() % ts;
+        (us_len, ts_len)
+    }
 }
 
 impl<T> Debug for Buffer<T> {
diff --git a/vortex-buffer/src/lib.rs b/vortex-buffer/src/lib.rs
index 0dbcacf50d..ae435db8a4 100644
--- a/vortex-buffer/src/lib.rs
+++ b/vortex-buffer/src/lib.rs
@@ -15,6 +15,7 @@
 //! compile-time alignment of `A`.
 //! * `buffer!` and `buffer_mut!` macros with the same syntax as the builtin `vec!` macro for
 //! inline construction of buffers.
+//! * `BitBuffer` and `BitBufferMut` provide packed bitsets that can be used to store boolean values.
 //!
 //! You can think of `BufferMut` as similar to a `Vec`, except that any operation that may
 //! cause a re-allocation, e.g.
extend, will ensure the new allocation maintains the buffer's @@ -45,6 +46,7 @@ //! `arrow_buffer::OffsetBuffer`. pub use alignment::*; +pub use bit::*; pub use buffer::*; pub use buffer_mut::*; pub use bytes::*; @@ -54,6 +56,7 @@ pub use string::*; mod alignment; #[cfg(feature = "arrow")] mod arrow; +mod bit; mod buffer; mod buffer_mut; mod bytes; diff --git a/vortex-expr/src/binary.rs b/vortex-expr/src/binary.rs index ff70f4c9c2..108724dbbd 100644 --- a/vortex-expr/src/binary.rs +++ b/vortex-expr/src/binary.rs @@ -140,8 +140,8 @@ impl PartialEq for BinaryExpr { /// let result = eq(ident(), lit(3)).evaluate(xs.as_ref()).unwrap(); /// /// assert_eq!( -/// result.to_bool().unwrap().boolean_buffer(), -/// BoolArray::from_iter(vec![false, false, true]).boolean_buffer(), +/// result.to_bool().unwrap().bit_buffer(), +/// BoolArray::from_iter(vec![false, false, true]).bit_buffer(), /// ); /// ``` pub fn eq(lhs: ExprRef, rhs: ExprRef) -> ExprRef { @@ -163,8 +163,8 @@ pub fn eq(lhs: ExprRef, rhs: ExprRef) -> ExprRef { /// let result = not_eq(ident(), lit(3)).evaluate(xs.as_ref()).unwrap(); /// /// assert_eq!( -/// result.to_bool().unwrap().boolean_buffer(), -/// BoolArray::from_iter(vec![true, true, false]).boolean_buffer(), +/// result.to_bool().unwrap().bit_buffer(), +/// BoolArray::from_iter(vec![true, true, false]).bit_buffer(), /// ); /// ``` pub fn not_eq(lhs: ExprRef, rhs: ExprRef) -> ExprRef { @@ -186,8 +186,8 @@ pub fn not_eq(lhs: ExprRef, rhs: ExprRef) -> ExprRef { /// let result = gt_eq(ident(), lit(3)).evaluate(xs.as_ref()).unwrap(); /// /// assert_eq!( -/// result.to_bool().unwrap().boolean_buffer(), -/// BoolArray::from_iter(vec![false, false, true]).boolean_buffer(), +/// result.to_bool().unwrap().bit_buffer(), +/// BoolArray::from_iter(vec![false, false, true]).bit_buffer(), /// ); /// ``` pub fn gt_eq(lhs: ExprRef, rhs: ExprRef) -> ExprRef { @@ -209,8 +209,8 @@ pub fn gt_eq(lhs: ExprRef, rhs: ExprRef) -> ExprRef { /// let result = gt(ident(), 
lit(2)).evaluate(xs.as_ref()).unwrap(); /// /// assert_eq!( -/// result.to_bool().unwrap().boolean_buffer(), -/// BoolArray::from_iter(vec![false, false, true]).boolean_buffer(), +/// result.to_bool().unwrap().bit_buffer(), +/// BoolArray::from_iter(vec![false, false, true]).bit_buffer(), /// ); /// ``` pub fn gt(lhs: ExprRef, rhs: ExprRef) -> ExprRef { @@ -232,8 +232,8 @@ pub fn gt(lhs: ExprRef, rhs: ExprRef) -> ExprRef { /// let result = lt_eq(ident(), lit(2)).evaluate(xs.as_ref()).unwrap(); /// /// assert_eq!( -/// result.to_bool().unwrap().boolean_buffer(), -/// BoolArray::from_iter(vec![true, true, false]).boolean_buffer(), +/// result.to_bool().unwrap().bit_buffer(), +/// BoolArray::from_iter(vec![true, true, false]).bit_buffer(), /// ); /// ``` pub fn lt_eq(lhs: ExprRef, rhs: ExprRef) -> ExprRef { @@ -255,8 +255,8 @@ pub fn lt_eq(lhs: ExprRef, rhs: ExprRef) -> ExprRef { /// let result = lt(ident(), lit(3)).evaluate(xs.as_ref()).unwrap(); /// /// assert_eq!( -/// result.to_bool().unwrap().boolean_buffer(), -/// BoolArray::from_iter(vec![true, true, false]).boolean_buffer(), +/// result.to_bool().unwrap().bit_buffer(), +/// BoolArray::from_iter(vec![true, true, false]).bit_buffer(), /// ); /// ``` pub fn lt(lhs: ExprRef, rhs: ExprRef) -> ExprRef { @@ -276,8 +276,8 @@ pub fn lt(lhs: ExprRef, rhs: ExprRef) -> ExprRef { /// let result = or(ident(), lit(false)).evaluate(xs.as_ref()).unwrap(); /// /// assert_eq!( -/// result.to_bool().unwrap().boolean_buffer(), -/// BoolArray::from_iter(vec![true, false, true]).boolean_buffer(), +/// result.to_bool().unwrap().bit_buffer(), +/// BoolArray::from_iter(vec![true, false, true]).bit_buffer(), /// ); /// ``` pub fn or(lhs: ExprRef, rhs: ExprRef) -> ExprRef { @@ -297,8 +297,8 @@ pub fn or(lhs: ExprRef, rhs: ExprRef) -> ExprRef { /// let result = and(ident(), lit(true)).evaluate(xs.as_ref()).unwrap(); /// /// assert_eq!( -/// result.to_bool().unwrap().boolean_buffer(), -/// BoolArray::from_iter(vec![true, false, 
true]).boolean_buffer(), +/// result.to_bool().unwrap().bit_buffer(), +/// BoolArray::from_iter(vec![true, false, true]).bit_buffer(), /// ); /// ``` pub fn and(lhs: ExprRef, rhs: ExprRef) -> ExprRef { diff --git a/vortex-expr/src/like.rs b/vortex-expr/src/like.rs index 44cf859c9d..92dcd70864 100644 --- a/vortex-expr/src/like.rs +++ b/vortex-expr/src/like.rs @@ -170,7 +170,7 @@ mod tests { .unwrap() .to_bool() .unwrap() - .boolean_buffer() + .bit_buffer() .iter() .collect::>(), vec![true, false, true, true, false, false] diff --git a/vortex-expr/src/not.rs b/vortex-expr/src/not.rs index cd567e4c3a..0dfc736e59 100644 --- a/vortex-expr/src/not.rs +++ b/vortex-expr/src/not.rs @@ -120,7 +120,7 @@ mod tests { .unwrap() .to_bool() .unwrap() - .boolean_buffer() + .bit_buffer() .iter() .collect::>(), vec![true, false, true, true, false, false] diff --git a/vortex-layout/src/layouts/flat/reader.rs b/vortex-layout/src/layouts/flat/reader.rs index 370bb90c81..8ecbbd6126 100644 --- a/vortex-layout/src/layouts/flat/reader.rs +++ b/vortex-layout/src/layouts/flat/reader.rs @@ -295,7 +295,7 @@ mod test { assert_eq!( &BooleanBuffer::from_iter([false, false, false, true, true]), - result.boolean_buffer() + result.bit_buffer() ); }) } diff --git a/vortex-layout/src/layouts/struct_/reader.rs b/vortex-layout/src/layouts/struct_/reader.rs index 9076597527..44fa65da80 100644 --- a/vortex-layout/src/layouts/struct_/reader.rs +++ b/vortex-layout/src/layouts/struct_/reader.rs @@ -401,7 +401,7 @@ mod tests { result .to_bool() .unwrap() - .boolean_buffer() + .bit_buffer() .iter() .collect::>() ); @@ -432,7 +432,7 @@ mod tests { result .to_bool() .unwrap() - .boolean_buffer() + .bit_buffer() .iter() .collect::>() ); diff --git a/vortex-layout/src/layouts/zoned/zone_map.rs b/vortex-layout/src/layouts/zoned/zone_map.rs index 08b57b272b..aa73eeea1c 100644 --- a/vortex-layout/src/layouts/zoned/zone_map.rs +++ b/vortex-layout/src/layouts/zoned/zone_map.rs @@ -246,14 +246,14 @@ mod tests { 
stats_table.array.fields()[1] .to_bool() .vortex_unwrap() - .boolean_buffer(), + .bit_buffer(), &BooleanBuffer::from(vec![false, true]) ); assert_eq!( stats_table.array.fields()[3] .to_bool() .vortex_unwrap() - .boolean_buffer(), + .bit_buffer(), &BooleanBuffer::from(vec![true, false]) ); } @@ -278,14 +278,14 @@ mod tests { stats_table.array.fields()[1] .to_bool() .vortex_unwrap() - .boolean_buffer(), + .bit_buffer(), &BooleanBuffer::from(vec![false]) ); assert_eq!( stats_table.array.fields()[3] .to_bool() .vortex_unwrap() - .boolean_buffer(), + .bit_buffer(), &BooleanBuffer::from(vec![false]) ); }