4
4
from itertools import takewhile , dropwhile
5
5
6
6
7
# TREE_BASES determines the bucket sizes that are used to build the tree.
# [1, 5] means that base-1 and base-5 buckets are used,
# e.g. 500 -> 100 -> 50 -> 10 etc.
TREE_BASES = [1, 5]
11
+
12
+
7
13
class BucketTree :
8
- def __init__ (self , root_bucket_size , root_bucket ):
9
- self ._root_bucket_size = root_bucket_size
10
- self . _buckets_by_level = {
11
- root_bucket_size : BucketLevel (
12
- bucket_size = root_bucket_size , buckets = [ root_bucket ])
14
def __init__(self, unbucketed_range, unbucketed_data, total_count, suppressed_count):
    '''Set up an empty tree and the queue of bucket sizes to explore.

    :param unbucketed_range: Stored under the 'range' key and passed to
                             `bu.estimate_bucket_size`
    :param unbucketed_data: Stored under the 'data' key
    :param total_count: Passed to `bu.estimate_bucket_size`
    :param suppressed_count: Stored under the 'suppressed' key
    '''
    self._unbucketed = {
        'range': unbucketed_range,
        'data': unbucketed_data,
        'suppressed': suppressed_count
    }
    first_bucket = bu.estimate_bucket_size(unbucketed_range, total_count)

    # Queue of sizes still to be queried. The estimated first bucket goes
    # last because levels are consumed from the end of the list.
    # NOTE(review): presumably `bu.buckets_with_base` yields a finite,
    # ascending sequence of sizes - confirm against the helper.
    smaller_sizes = [b for b in bu.buckets_with_base(TREE_BASES)
                     if b < first_bucket]
    self._to_explore = [*smaller_sizes, first_bucket]

    # Maps bucket size -> BucketLevel for every level inserted so far.
    self._explored_buckets = {}
19
26
20
- def next_level (self ):
21
- smallest_level_so_far = min (self ._buckets_by_level .keys ())
22
- return next (self .levels_below (smallest_level_so_far ))
27
def next_levels(self, depth):
    '''Return up to `depth` bucket sizes from the end of the exploration queue.

    :param depth: How many pending levels to return
    :return: A new list with the last `depth` entries of the queue, in queue
             order; an empty list when `depth` is zero or negative
    '''
    # A plain `[-depth:]` would return the *whole* queue for depth == 0,
    # because -0 == 0.
    if depth <= 0:
        return []
    return self._to_explore[-depth:]
23
29
24
30
def buckets_at_level(self, level):
    '''Return the buckets stored for `level` as a list of flattened buckets.

    :param level: The bucket size identifying the level
    :return: The level's `as_flat_list()` result, or an empty list when no
             level of that size has been inserted yet
    '''
    bucket_level = self._explored_buckets.get(level)
    if bucket_level is None:
        # Unknown level: there is nothing to flatten.
        return []
    return bucket_level.as_flat_list()
26
32
27
- def root_bucket (self ):
28
- return self ._buckets_by_level [ self . _root_bucket_size ]
33
def bucket_levels(self):
    '''Return the bucket sizes of all levels inserted so far, as a new list.'''
    return list(self._explored_buckets)
29
35
30
- def insert_query_result (self , bucket_size , metadata , buckets ):
36
def insert_query_result(self, bucket_size, buckets, **kwargs):
    '''Insert the result of a bucketed query

    :param bucket_size: The bucket size at this level; must equal the last
                        entry of the exploration queue
    :param buckets: Should be a list of `Bucket`s
    :param kwargs: Extra data about this level of buckets (eg. column labels,
                   suppressed values); stored as the level's metadata
    '''
    # Peek instead of popping so a rejected insert leaves the queue intact.
    # An empty queue still raises IndexError, as `pop()` did.
    next_level = self._to_explore[-1]
    assert bucket_size == next_level, f'Wrong bucket size, expected {next_level} , got {bucket_size} '

    bl = BucketLevel(bucket_size=bucket_size,
                     metadata=dict(kwargs), buckets=buckets)

    # Only consume the queued level once the BucketLevel was built.
    self._to_explore.pop()
    self._explored_buckets[bucket_size] = bl
49
51
50
52
def get_bucket (self , bucket ):
51
53
result = None
@@ -55,31 +57,32 @@ def get_bucket(self, bucket):
55
57
56
58
return result
57
59
60
def get_buckets(self, levels):
    '''Return the flattened buckets of several levels as one list.

    :param levels: Bucket sizes to include; when empty (or None) every
                   inserted level is used
    :return: The concatenation of `buckets_at_level(level)` for each level,
             in the order the levels are given
    '''
    if not levels:
        levels = self.bucket_levels()
    return [bucket
            for level in levels
            for bucket in self.buckets_at_level(level)]
64
+
58
65
59
- # TODO:
60
- # - BucketLevel constructor to take `parent` argument
61
- # - If `parent` is None, fill in gaps with zero-count buckets
62
- # - Otherwise interpolate missing buckets from the parent.
63
66
class BucketLevel :
64
67
'''Container class for buckets of the same size
65
68
'''
66
69
67
70
def __init__ (self , * , bucket_size , buckets , metadata = None , parent_level = None ):
68
71
'''
69
72
:param bucket_size: The bucket size at this level
70
- :param metadata: Metadata associated with this bucket level
71
- :param buckets: Should be a list of Bucket
73
+ :param metadata: Metadata associated with this bucket level
74
+ :param buckets: Should be a list of ` Bucket`s
72
75
:param parent: If the parent is not provided, fill in gaps between buckets
73
- with empty buckets (count = 0), otherwise interpolate missing buckets.
76
+ with empty buckets (count = 0), otherwise interpolate missing buckets.
74
77
'''
75
- self .bucket_size = bucket_size
76
- self .metadata = metadata
78
+ self ._bucket_size = bucket_size
79
+ self ._metadata = metadata
77
80
if parent_level is None :
78
81
fake_lo = min (bucket .lower_bound for bucket in buckets )
79
82
fake_hi = max (bucket .upper_bound () for bucket in buckets )
80
83
fake_count = sum (bucket .data .count for bucket in buckets )
81
84
parent_level = [
82
- Bucket (fake_hi - fake_lo , fake_lo , { 'count' : fake_count , 'min' : fake_lo , 'max' : fake_hi } )]
85
+ Bucket (fake_hi - fake_lo , fake_lo , [ fake_count , fake_lo , fake_hi ], FakeData )]
83
86
84
87
interpolated = []
85
88
bucket_iter = iter (buckets )
@@ -88,34 +91,46 @@ def __init__(self, *, bucket_size, buckets, metadata=None, parent_level=None):
88
91
takewhile (lambda small : parent_bucket .contains (small ), bucket_iter ))
89
92
interpolated += parent_bucket .interpolate_children (children )
90
93
91
- self .buckets = dict ([(bucket .lower_bound , bucket )
92
- for bucket in interpolated ])
94
+ self ._buckets = dict ([(bucket .lower_bound , bucket )
95
+ for bucket in interpolated ])
93
96
94
97
def get_bucket(self, bucket_size, lower_bound):
    '''Look up a single bucket of this level.

    :param bucket_size: Must match this level's bucket size
    :param lower_bound: Lower bound of the wanted bucket
    :return: The bucket, or None when the size does not match this level or
             no bucket starts at `lower_bound`
    '''
    if bucket_size != self._bucket_size:
        return None

    return self._buckets.get(lower_bound)
99
102
100
103
def buckets_in_range(self, range_lo, range_hi):
    '''Yield the buckets whose lower bound lies in [range_lo, range_hi).

    :param range_lo: Inclusive lower limit for a bucket's lower bound
    :param range_hi: Exclusive upper limit for a bucket's lower bound
    :return: A generator over the matching buckets
    '''
    # Iterate `.items()`: iterating the dict itself yields only the keys,
    # which cannot be unpacked into (lower_bound, bucket).
    return (bucket for lower_bound, bucket in self._buckets.items()
            if range_lo <= lower_bound < range_hi)
103
106
104
107
def add_metadata(self, metadata):
    '''Merge `metadata` into this level's metadata.

    :param metadata: Mapping of extra entries; existing keys are overwritten
    '''
    # The constructor's `metadata` argument defaults to None, so there may
    # be no dict to update yet.
    if self._metadata is None:
        self._metadata = {}
    self._metadata.update(metadata)
109
+
110
def as_flat_list(self):
    '''Return every bucket of this level in flattened form.

    :return: A list with the result of `flatten()` for each stored bucket,
             in the order the buckets are stored
    '''
    return [bucket.flatten() for bucket in self._buckets.values()]
106
112
107
113
def __iter__ (self ):
108
- return self .buckets .values ()
114
+ return self ._buckets .values ()
115
+
116
+
117
# Wrappers for the per-bucket data, one per origin of the bucket.
# QueryData is the default wrapper used by `Bucket`.
QueryData = namedtuple('QueryData', 'count count_noise min max avg')
# FakeData is used for the stand-in parent bucket built by `BucketLevel`.
FakeData = namedtuple('FakeData', 'count min max')
# SyntheticData is used for buckets created by interpolation.
SyntheticData = namedtuple('SyntheticData', 'count')
# EmptyData is used for a bucket constructed without any data.
EmptyData = namedtuple('EmptyData', '')
109
121
110
122
111
123
class Bucket :
112
124
'''Container class for bucketed data
113
125
'''
114
126
115
- def __init__ (self , bucket_size , lower_bound , bucket_data = None ):
127
def __init__(self, bucket_size, lower_bound, bucket_data=None, data_wrapper=QueryData):
    '''
    :param bucket_size: Width of the bucket
    :param lower_bound: Lower bound of the bucket
    :param bucket_data: Sequence of values unpacked into `data_wrapper`;
                        None (the default) stores an `EmptyData` instead
    :param data_wrapper: Callable applied to the unpacked `bucket_data`
    '''
    # `bucket_data` keeps a None default so a bucket can be built from a
    # bare (size, lower_bound) index, as `split_to_size` does.
    self.size = bucket_size
    self.lower_bound = lower_bound
    if bucket_data is not None:
        self.data = data_wrapper(*bucket_data)
    else:
        self.data = EmptyData()
119
134
120
135
def __eq__ (self , other ):
121
136
return self .size == other .size and self .lower_bound == other .lower_bound
@@ -126,6 +141,26 @@ def __hash__(self):
126
141
def __str__ (self ):
127
142
return f'Bucket({ self .lower_bound } - { self .lower_bound + self .size } )'
128
143
144
def index(self):
    '''Return the (size, lower_bound) pair identifying this bucket.'''
    return (self.size, self.lower_bound)
146
+
147
def upper_bound(self):
    '''Return the upper edge of the bucket (lower bound plus size).'''
    return self.lower_bound + self.size
149
+
150
def contains(self, other):
    '''Tell whether `other` lies entirely within this bucket.

    :param other: Bucket to test; touching edges count as contained
    :return: True when this bucket starts at or before `other` and ends at
             or after it
    '''
    starts_before = self.lower_bound <= other.lower_bound
    return starts_before and self.upper_bound() >= other.upper_bound()
152
+
153
def parent_index(self, parent_size):
    '''Return the index of the `parent_size` bucket this bucket starts in.

    :param parent_size: Size of the parent bucket
    :return: (parent_size, lower_bound), with the lower bound rounded down
             to a multiple of `parent_size`
    '''
    # TODO: add some assertions / restrictions regarding buckets sizes
    parent_lower_bound = (self.lower_bound // parent_size) * parent_size
    return (parent_size, parent_lower_bound)
156
+
157
def child_indices(self, child_size):
    '''Return the indices of the `child_size` buckets covering this bucket.

    :param child_size: Size of the child buckets
    :return: A list of (child_size, lower_bound) pairs, stepping by
             `child_size` from this bucket's lower bound up to (excluding)
             its upper bound
    '''
    # TODO: add some assertions / restrictions regarding buckets sizes
    stop = self.lower_bound + self.size
    return [(child_size, start)
            for start in range(self.lower_bound, stop, child_size)]
160
+
161
def flatten(self):
    '''Return the bucket as a flat list: size, lower bound, then its data fields.'''
    return [self.size, self.lower_bound, *self.data]
163
+
129
164
def split_to_size (self , smaller_size ):
130
165
'''Split a bucket into multiple buckets of a smaller size
131
166
'''
@@ -137,22 +172,15 @@ def split_to_size(self, smaller_size):
137
172
# return None to signal that the desired bucket size doesn't fit
138
173
return None
139
174
140
- upper_bound = self .lower_bound + self .size
141
- return [Bucket (smaller_size , lower_bound ) for lower_bound in range (lower_bound , upper_bound , smaller_size )]
142
-
143
- def upper_bound (self ):
144
- return self .lower_bound + self .size
145
-
146
- def contains (self , other ):
147
- return self .lower_bound >= other .lower_bound and self .upper_bound () <= other .upper_bound ()
175
+ return [Bucket (* index ) for index in self .child_indices (smaller_size )]
148
176
149
- def interpolate_children (self , small_buckets : List [ Bucket ] ):
150
- '''Interpolate gaps in small buckets from a larger one.
177
+ def interpolate_children (self , small_buckets ):
178
+ '''Interpolate gaps in small buckets from a larger one.
151
179
'''
152
180
small_bucket_size = small_buckets [0 ].size
153
181
assert self .size % small_bucket_size == 0 , f'Bucket { self .size } does not divide exactly into buckets of size { small_bucket_size } '
154
182
155
- small_buckets_expected_num = self .size / small_bucket_size
183
+ small_buckets_expected_num = int ( self .size // small_bucket_size )
156
184
if small_buckets_expected_num == len (small_buckets ):
157
185
return small_buckets
158
186
@@ -168,24 +196,9 @@ def interpolate_children(self, small_buckets: List[Bucket]):
168
196
sum (bucket .data .count for bucket in small_buckets )
169
197
count_per_bucket = missing_total / len (missing_lower_bounds )
170
198
171
- synthetic_buckets = [Bucket (small_bucket_size , lower_bound , {
172
- 'count' : count_per_bucket }) for lower_bound in missing_lower_bounds ]
199
+ synthetic_buckets = [Bucket (small_bucket_size , lower_bound , [ count_per_bucket ], data_wrapper = SyntheticData )
200
+ for lower_bound in missing_lower_bounds ]
173
201
174
202
small_buckets += synthetic_buckets
175
203
small_buckets .sort (key = lambda bucket : bucket .lower_bound )
176
204
return small_buckets
177
-
178
-
179
- class BucketData :
180
- def __init__ (self , bucket_data ):
181
- if type (bucket_data ) == type ([]):
182
- self .count = sum ([data ['count' ] for data in bucket_data ])
183
- self .min = min ([data ['min' ] for data in bucket_data ])
184
- self .max = max ([data ['max' ] for data in bucket_data ])
185
- else :
186
- self .count = bucket_data ['count' ]
187
- self .min = bucket_data ['min' ]
188
- self .max = bucket_data ['max' ]
189
-
190
- def __str__ (self ):
191
- return f'BucketData(count: { self .count } , min: { self .min } , max: { self .max } )'
0 commit comments