@@ -27,8 +27,12 @@ use arrow::record_batch::RecordBatch;
27
27
use datafusion_common:: Result ;
28
28
use datafusion_execution:: memory_pool:: proxy:: VecAllocExt ;
29
29
use datafusion_expr:: EmitTo ;
30
+ use datafusion_functions_aggregate_common:: aggregate:: groups_accumulator:: {
31
+ BlockedGroupIndexOperations , FlatGroupIndexOperations , GroupIndexOperations ,
32
+ } ;
30
33
use half:: f16;
31
34
use hashbrown:: hash_table:: HashTable ;
35
+ use std:: collections:: VecDeque ;
32
36
use std:: mem:: size_of;
33
37
use std:: sync:: Arc ;
34
38
@@ -81,17 +85,31 @@ hash_float!(f16, f32, f64);
81
85
pub struct GroupValuesPrimitive<T: ArrowPrimitiveType> {
    /// The data type of the output array
    data_type: DataType,

    /// Stores the group index based on the hash of its value
    ///
    /// We don't store the hashes as hashing fixed width primitives
    /// is fast enough for this not to benefit performance
    map: HashTable<u64>,

    /// The group index of the null value if any
    null_group: Option<u64>,

    /// The values for each group index, stored as a queue of blocks
    /// (each block is a `Vec` of native values)
    values: VecDeque<Vec<T::Native>>,

    /// The random state used to generate hashes
    random_state: RandomState,

    /// Block size of the current `GroupValues`, if any:
    ///   - If `None`, block optimization is disabled and all
    ///     `group values` are stored in a single `Vec`
    ///
    ///   - If `Some(blk_size)`, block optimization is enabled:
    ///     `group values` are stored in multiple `Vec`s, each `Vec`
    ///     is of `blk_size` len, and we call it a `block`
    ///
    block_size: Option<usize>,
}
96
114
97
115
impl < T : ArrowPrimitiveType > GroupValuesPrimitive < T > {
@@ -100,9 +118,10 @@ impl<T: ArrowPrimitiveType> GroupValuesPrimitive<T> {
100
118
Self {
101
119
data_type,
102
120
map : HashTable :: with_capacity ( 128 ) ,
103
- values : Vec :: with_capacity ( 128 ) ,
121
+ values : VecDeque :: new ( ) ,
104
122
null_group : None ,
105
123
random_state : Default :: default ( ) ,
124
+ block_size : None ,
106
125
}
107
126
}
108
127
}
@@ -112,43 +131,30 @@ where
112
131
T :: Native : HashValue ,
113
132
{
114
133
fn intern ( & mut self , cols : & [ ArrayRef ] , groups : & mut Vec < usize > ) -> Result < ( ) > {
115
- assert_eq ! ( cols. len( ) , 1 ) ;
116
- groups. clear ( ) ;
117
-
118
- for v in cols[ 0 ] . as_primitive :: < T > ( ) {
119
- let group_id = match v {
120
- None => * self . null_group . get_or_insert_with ( || {
121
- let group_id = self . values . len ( ) ;
122
- self . values . push ( Default :: default ( ) ) ;
123
- group_id
124
- } ) ,
125
- Some ( key) => {
126
- let state = & self . random_state ;
127
- let hash = key. hash ( state) ;
128
- let insert = self . map . entry (
129
- hash,
130
- |g| unsafe { self . values . get_unchecked ( * g) . is_eq ( key) } ,
131
- |g| unsafe { self . values . get_unchecked ( * g) . hash ( state) } ,
132
- ) ;
133
-
134
- match insert {
135
- hashbrown:: hash_table:: Entry :: Occupied ( o) => * o. get ( ) ,
136
- hashbrown:: hash_table:: Entry :: Vacant ( v) => {
137
- let g = self . values . len ( ) ;
138
- v. insert ( g) ;
139
- self . values . push ( key) ;
140
- g
141
- }
142
- }
134
+ if let Some ( block_size) = self . block_size {
135
+ let before_add_group = |group_values : & mut VecDeque < Vec < T :: Native > > | {
136
+ if group_values. back ( ) . unwrap ( ) . len ( ) == block_size {
137
+ let new_block = Vec :: with_capacity ( block_size) ;
138
+ group_values. push_back ( new_block) ;
143
139
}
144
140
} ;
145
- groups. push ( group_id)
141
+ self . get_or_create_groups :: < _ , BlockedGroupIndexOperations > (
142
+ cols,
143
+ groups,
144
+ before_add_group,
145
+ )
146
+ } else {
147
+ self . get_or_create_groups :: < _ , FlatGroupIndexOperations > (
148
+ cols,
149
+ groups,
150
+ |_: & mut VecDeque < Vec < T :: Native > > | { } ,
151
+ )
146
152
}
147
- Ok ( ( ) )
148
153
}
149
154
150
155
fn size ( & self ) -> usize {
151
- self . map . capacity ( ) * size_of :: < usize > ( ) + self . values . allocated_size ( )
156
+ todo ! ( )
157
+ // self.map.capacity() * size_of::<usize>() + self.values.len
152
158
}
153
159
154
160
fn is_empty ( & self ) -> bool {
@@ -160,54 +166,55 @@ where
160
166
}
161
167
162
168
fn emit ( & mut self , emit_to : EmitTo ) -> Result < Vec < ArrayRef > > {
163
- fn build_primitive < T : ArrowPrimitiveType > (
164
- values : Vec < T :: Native > ,
165
- null_idx : Option < usize > ,
166
- ) -> PrimitiveArray < T > {
167
- let nulls = null_idx. map ( |null_idx| {
168
- let mut buffer = NullBufferBuilder :: new ( values. len ( ) ) ;
169
- buffer. append_n_non_nulls ( null_idx) ;
170
- buffer. append_null ( ) ;
171
- buffer. append_n_non_nulls ( values. len ( ) - null_idx - 1 ) ;
172
- // NOTE: The inner builder must be constructed as there is at least one null
173
- buffer. finish ( ) . unwrap ( )
174
- } ) ;
175
- PrimitiveArray :: < T > :: new ( values. into ( ) , nulls)
176
- }
169
+ todo ! ( )
170
+ // fn build_primitive<T: ArrowPrimitiveType>(
171
+ // values: Vec<T::Native>,
172
+ // null_idx: Option<usize>,
173
+ // ) -> PrimitiveArray<T> {
174
+ // let nulls = null_idx.map(|null_idx| {
175
+ // let mut buffer = NullBufferBuilder::new(values.len());
176
+ // buffer.append_n_non_nulls(null_idx);
177
+ // buffer.append_null();
178
+ // buffer.append_n_non_nulls(values.len() - null_idx - 1);
179
+ // // NOTE: The inner builder must be constructed as there is at least one null
180
+ // buffer.finish().unwrap()
181
+ // });
182
+ // PrimitiveArray::<T>::new(values.into(), nulls)
183
+ // }
177
184
178
- let array: PrimitiveArray < T > = match emit_to {
179
- EmitTo :: All => {
180
- self . map . clear ( ) ;
181
- build_primitive ( std:: mem:: take ( & mut self . values ) , self . null_group . take ( ) )
182
- }
183
- EmitTo :: First ( n) => {
184
- self . map . retain ( |group_idx| {
185
- // Decrement group index by n
186
- match group_idx. checked_sub ( n) {
187
- // Group index was >= n, shift value down
188
- Some ( sub) => {
189
- * group_idx = sub;
190
- true
191
- }
192
- // Group index was < n, so remove from table
193
- None => false ,
194
- }
195
- } ) ;
196
- let null_group = match & mut self . null_group {
197
- Some ( v) if * v >= n => {
198
- * v -= n;
199
- None
200
- }
201
- Some ( _) => self . null_group . take ( ) ,
202
- None => None ,
203
- } ;
204
- let mut split = self . values . split_off ( n) ;
205
- std:: mem:: swap ( & mut self . values , & mut split) ;
206
- build_primitive ( split, null_group)
207
- }
208
- } ;
185
+ // let array: PrimitiveArray<T> = match emit_to {
186
+ // EmitTo::All => {
187
+ // self.map.clear();
188
+ // build_primitive(std::mem::take(&mut self.values), self.null_group.take())
189
+ // }
190
+ // EmitTo::First(n) => {
191
+ // self.map.retain(|group_idx| {
192
+ // // Decrement group index by n
193
+ // match group_idx.checked_sub(n) {
194
+ // // Group index was >= n, shift value down
195
+ // Some(sub) => {
196
+ // *group_idx = sub;
197
+ // true
198
+ // }
199
+ // // Group index was < n, so remove from table
200
+ // None => false,
201
+ // }
202
+ // });
203
+ // let null_group = match &mut self.null_group {
204
+ // Some(v) if *v >= n => {
205
+ // *v -= n;
206
+ // None
207
+ // }
208
+ // Some(_) => self.null_group.take(),
209
+ // None => None,
210
+ // };
211
+ // let mut split = self.values.split_off(n);
212
+ // std::mem::swap(&mut self.values, &mut split);
213
+ // build_primitive(split, null_group)
214
+ // }
215
+ // };
209
216
210
- Ok ( vec ! [ Arc :: new( array. with_data_type( self . data_type. clone( ) ) ) ] )
217
+ // Ok(vec![Arc::new(array.with_data_type(self.data_type.clone()))])
211
218
}
212
219
213
220
fn clear_shrink ( & mut self , batch : & RecordBatch ) {
@@ -218,3 +225,86 @@ where
218
225
self . map . shrink_to ( count, |_| 0 ) ; // hasher does not matter since the map is cleared
219
226
}
220
227
}
228
+
229
+ impl < T : ArrowPrimitiveType > GroupValuesPrimitive < T >
230
+ where
231
+ T :: Native : HashValue ,
232
+ {
233
+ fn get_or_create_groups < F , O > (
234
+ & mut self ,
235
+ cols : & [ ArrayRef ] ,
236
+ groups : & mut Vec < usize > ,
237
+ mut before_add_group : F ,
238
+ ) -> Result < ( ) >
239
+ where
240
+ F : FnMut ( & mut VecDeque < Vec < T :: Native > > ) ,
241
+ O : GroupIndexOperations ,
242
+ {
243
+ assert_eq ! ( cols. len( ) , 1 ) ;
244
+ groups. clear ( ) ;
245
+
246
+ for v in cols[ 0 ] . as_primitive :: < T > ( ) {
247
+ let group_index = match v {
248
+ None => * self . null_group . get_or_insert_with ( || {
249
+ // actions before add new group like checking if room is enough
250
+ before_add_group ( & mut self . values ) ;
251
+
252
+ // get block infos and update block
253
+ let block_id = self . values . len ( ) as u32 ;
254
+ let current_block = self . values . back_mut ( ) . unwrap ( ) ;
255
+ let block_offset = current_block. len ( ) as u64 ;
256
+ current_block. push ( Default :: default ( ) ) ;
257
+
258
+ // get group index and finish actions needed it
259
+ O :: pack_index ( block_id, block_offset)
260
+ } ) ,
261
+ Some ( key) => {
262
+ let state = & self . random_state ;
263
+ let hash = key. hash ( state) ;
264
+ let insert = self . map . entry (
265
+ hash,
266
+ |g| unsafe {
267
+ let block_id = O :: get_block_id ( * g) ;
268
+ let block_offset = O :: get_block_offset ( * g) ;
269
+ self . values
270
+ . get ( block_id as usize )
271
+ . unwrap ( )
272
+ . get_unchecked ( block_offset as usize )
273
+ . is_eq ( key)
274
+ } ,
275
+ |g| unsafe {
276
+ let block_id = O :: get_block_id ( * g) ;
277
+ let block_offset = O :: get_block_offset ( * g) ;
278
+ self . values
279
+ . get ( block_id as usize )
280
+ . unwrap ( )
281
+ . get_unchecked ( block_offset as usize )
282
+ . hash ( state)
283
+ } ,
284
+ ) ;
285
+
286
+ match insert {
287
+ hashbrown:: hash_table:: Entry :: Occupied ( o) => * o. get ( ) ,
288
+ hashbrown:: hash_table:: Entry :: Vacant ( v) => {
289
+ // actions before add new group like checking if room is enough
290
+ before_add_group ( & mut self . values ) ;
291
+
292
+ // get block infos and update block
293
+ let block_id = self . values . len ( ) as u32 ;
294
+ let current_block = self . values . back_mut ( ) . unwrap ( ) ;
295
+ let block_offset = current_block. len ( ) as u64 ;
296
+ current_block. push ( key) ;
297
+
298
+ // get group index and finish actions needed it
299
+ let packed_index = O :: pack_index ( block_id, block_offset) ;
300
+ v. insert ( packed_index) ;
301
+ packed_index
302
+ }
303
+ }
304
+ }
305
+ } ;
306
+ groups. push ( group_index as usize )
307
+ }
308
+ Ok ( ( ) )
309
+ }
310
+ }
0 commit comments