Skip to content
This repository was archived by the owner on Jul 2, 2024. It is now read-only.

Commit f487b1c

Browse files
Daniel LennonDaniel Lennon
Daniel Lennon
authored and
Daniel Lennon
committed
Finish up initial bucket tree implementation
1 parent 10d027f commit f487b1c

File tree

2 files changed

+119
-123
lines changed

2 files changed

+119
-123
lines changed

explorer/bucket_tree.py

+85-72
Original file line numberDiff line numberDiff line change
@@ -4,48 +4,50 @@
44
from itertools import takewhile, dropwhile
55

66

7+
'''TREE_BASES determines the bucket sizes that are used to build the tree.
8+
[1, 5] means that base-1 and base-5 buckets are used, e.g. 500 -> 100 -> 50 -> 10, etc.
9+
'''
10+
TREE_BASES = [1, 5]
11+
12+
713
class BucketTree:
8-
def __init__(self, root_bucket_size, root_bucket):
9-
self._root_bucket_size = root_bucket_size
10-
self._buckets_by_level = {
11-
root_bucket_size: BucketLevel(
12-
bucket_size=root_bucket_size, buckets=[root_bucket])
14+
def __init__(self, unbucketed_range, unbucketed_data, total_count, suppressed_count):
15+
self._unbucketed = {
16+
'range': unbucketed_range,
17+
'data': unbucketed_data,
18+
'suppressed': suppressed_count
1319
}
20+
first_bucket = bu.estimate_bucket_size(unbucketed_range, total_count)
21+
self._to_explore = [b for b in bu.buckets_with_base(
22+
TREE_BASES) if b < first_bucket]
1423

15-
def levels_below(self, level):
16-
'''For determining the next appropriate bucket size(s)
17-
'''
18-
return (bs for bs in bu.buckets_smaller_than(level) if bu.base(bs) != bu.base(self._root_bucket_size))
24+
self._to_explore.append(first_bucket)
25+
self._explored_buckets = {}
1926

20-
def next_level(self):
21-
smallest_level_so_far = min(self._buckets_by_level.keys())
22-
return next(self.levels_below(smallest_level_so_far))
27+
def next_levels(self, depth):
28+
return self._to_explore[-depth:]
2329

2430
def buckets_at_level(self, level):
25-
return self._buckets_by_level.get(level)
31+
return self._explored_buckets.get(level).as_flat_list()
2632

27-
def root_bucket(self):
28-
return self._buckets_by_level[self._root_bucket_size]
33+
def bucket_levels(self):
34+
return list(self._explored_buckets.keys())
2935

30-
def insert_query_result(self, bucket_size, metadata, buckets):
36+
def insert_query_result(self, bucket_size, buckets, **kwargs):
3137
'''Insert the result of a bucketed query
3238
3339
:param bucket_size: The bucket size at this level
3440
:param metadata: A dict containing extra data about this level of buckets (e.g. column labels, suppressed values)
35-
:param buckets: Should be a list of (bucket_size, lower_bound, bucket_data)
41+
:param buckets: Should be a list of `Bucket`s
3642
'''
37-
assert bucket_size < self._root_bucket_size, "Can't insert a bucket level above the root"
38-
39-
assert bucket_size < min(self._buckets_by_level.keys(
40-
)), "Inserting a bucket level above the lowest level is not yet supported"
41-
42-
next_level = self.next_level()
43-
assert bucket_size == next_level, f'Wrong bucket size, expected {next_level}'
43+
next_level = self._to_explore.pop()
44+
assert bucket_size == next_level, f'Wrong bucket size, expected {next_level}, got {bucket_size}'
4445

46+
metadata = dict(kwargs)
4547
bl = BucketLevel(bucket_size=bucket_size,
4648
metadata=metadata, buckets=buckets)
4749

48-
self._buckets_by_level.update({bucket_size: bl})
50+
self._explored_buckets.update({bucket_size: bl})
4951

5052
def get_bucket(self, bucket):
5153
result = None
@@ -55,31 +57,32 @@ def get_bucket(self, bucket):
5557

5658
return result
5759

60+
def get_buckets(self, levels):
61+
if len(levels) == 0:
62+
levels = self.bucket_levels()
63+
return [bucket for level in levels for bucket in self.buckets_at_level(level)]
64+
5865

59-
# TODO:
60-
# - BucketLevel constructor to take `parent` argument
61-
# - If `parent` is None, fill in gaps with zero-count buckets
62-
# - Otherwise interpolate missing buckets from the parent.
6366
class BucketLevel:
6467
'''Container class for buckets of the same size
6568
'''
6669

6770
def __init__(self, *, bucket_size, buckets, metadata=None, parent_level=None):
6871
'''
6972
:param bucket_size: The bucket size at this level
70-
:param metadata: Metadata associated with this bucket level
71-
:param buckets: Should be a list of Bucket
73+
:param metadata: Metadata associated with this bucket level
74+
:param buckets: Should be a list of `Bucket`s
7275
:param parent_level: If the parent level is not provided, fill in gaps between buckets
73-
with empty buckets (count = 0), otherwise interpolate missing buckets.
76+
with empty buckets (count = 0), otherwise interpolate missing buckets.
7477
'''
75-
self.bucket_size = bucket_size
76-
self.metadata = metadata
78+
self._bucket_size = bucket_size
79+
self._metadata = metadata
7780
if parent_level is None:
7881
fake_lo = min(bucket.lower_bound for bucket in buckets)
7982
fake_hi = max(bucket.upper_bound() for bucket in buckets)
8083
fake_count = sum(bucket.data.count for bucket in buckets)
8184
parent_level = [
82-
Bucket(fake_hi - fake_lo, fake_lo, {'count': fake_count, 'min': fake_lo, 'max': fake_hi})]
85+
Bucket(fake_hi - fake_lo, fake_lo, [fake_count, fake_lo, fake_hi], FakeData)]
8386

8487
interpolated = []
8588
bucket_iter = iter(buckets)
@@ -88,34 +91,46 @@ def __init__(self, *, bucket_size, buckets, metadata=None, parent_level=None):
8891
takewhile(lambda small: parent_bucket.contains(small), bucket_iter))
8992
interpolated += parent_bucket.interpolate_children(children)
9093

91-
self.buckets = dict([(bucket.lower_bound, bucket)
92-
for bucket in interpolated])
94+
self._buckets = dict([(bucket.lower_bound, bucket)
95+
for bucket in interpolated])
9396

9497
def get_bucket(self, bucket_size, lower_bound):
95-
if bucket_size != self.bucket_size:
98+
if bucket_size != self._bucket_size:
9699
return None
97100

98-
return self.buckets.get(lower_bound)
101+
return self._buckets.get(lower_bound)
99102

100103
def buckets_in_range(self, range_lo, range_hi):
101-
return (bucket for (lower_bound, bucket) in self.buckets
104+
return (bucket for (lower_bound, bucket) in self._buckets
102105
if lower_bound >= range_lo and lower_bound < range_hi)
103106

104107
def add_metadata(self, metadata):
105-
self.metadata.update(metadata)
108+
self._metadata.update(metadata)
109+
110+
def as_flat_list(self):
111+
return [bucket.flatten() for bucket in self._buckets.values()]
106112

107113
def __iter__(self):
108-
return self.buckets.values()
114+
return self._buckets.values()
115+
116+
117+
QueryData = namedtuple('QueryData', 'count count_noise min max avg')
118+
FakeData = namedtuple('FakeData', 'count min max')
119+
SyntheticData = namedtuple('SyntheticData', 'count')
120+
EmptyData = namedtuple('EmptyData', '')
109121

110122

111123
class Bucket:
112124
'''Container class for bucketed data
113125
'''
114126

115-
def __init__(self, bucket_size, lower_bound, bucket_data=None):
127+
def __init__(self, bucket_size, lower_bound, bucket_data, data_wrapper=QueryData):
116128
self.size = bucket_size
117129
self.lower_bound = lower_bound
118-
self.data = BucketData(bucket_data)
130+
if bucket_data is not None:
131+
self.data = data_wrapper(*bucket_data)
132+
else:
133+
self.data = EmptyData()
119134

120135
def __eq__(self, other):
121136
return self.size == other.size and self.lower_bound == other.lower_bound
@@ -126,6 +141,26 @@ def __hash__(self):
126141
def __str__(self):
127142
return f'Bucket({self.lower_bound} - {self.lower_bound + self.size})'
128143

144+
def index(self):
145+
return (self.size, self.lower_bound)
146+
147+
def upper_bound(self):
148+
return self.lower_bound + self.size
149+
150+
def contains(self, other):
151+
return self.lower_bound <= other.lower_bound and self.upper_bound() >= other.upper_bound()
152+
153+
def parent_index(self, parent_size):
154+
# TODO: add some assertions / restrictions regarding bucket sizes
155+
return (parent_size, (self.lower_bound // parent_size) * parent_size)
156+
157+
def child_indices(self, child_size):
158+
# TODO: add some assertions / restrictions regarding bucket sizes
159+
return [(child_size, i) for i in range(self.lower_bound, self.lower_bound + self.size, child_size)]
160+
161+
def flatten(self):
162+
return [self.size, self.lower_bound, *self.data]
163+
129164
def split_to_size(self, smaller_size):
130165
'''Split a bucket into multiple buckets of a smaller size
131166
'''
@@ -137,22 +172,15 @@ def split_to_size(self, smaller_size):
137172
# return None to signal that the desired bucket size doesn't fit
138173
return None
139174

140-
upper_bound = self.lower_bound + self.size
141-
return [Bucket(smaller_size, lower_bound) for lower_bound in range(lower_bound, upper_bound, smaller_size)]
142-
143-
def upper_bound(self):
144-
return self.lower_bound + self.size
145-
146-
def contains(self, other):
147-
return self.lower_bound >= other.lower_bound and self.upper_bound() <= other.upper_bound()
175+
return [Bucket(*index) for index in self.child_indices(smaller_size)]
148176

149-
def interpolate_children(self, small_buckets: List[Bucket]):
150-
'''Interpolate gaps in small buckets from a larger one.
177+
def interpolate_children(self, small_buckets):
178+
'''Interpolate gaps in small buckets from a larger one.
151179
'''
152180
small_bucket_size = small_buckets[0].size
153181
assert self.size % small_bucket_size == 0, f'Bucket {self.size} does not divide exactly into buckets of size {small_bucket_size}'
154182

155-
small_buckets_expected_num = self.size / small_bucket_size
183+
small_buckets_expected_num = int(self.size // small_bucket_size)
156184
if small_buckets_expected_num == len(small_buckets):
157185
return small_buckets
158186

@@ -168,24 +196,9 @@ def interpolate_children(self, small_buckets: List[Bucket]):
168196
sum(bucket.data.count for bucket in small_buckets)
169197
count_per_bucket = missing_total / len(missing_lower_bounds)
170198

171-
synthetic_buckets = [Bucket(small_bucket_size, lower_bound, {
172-
'count': count_per_bucket}) for lower_bound in missing_lower_bounds]
199+
synthetic_buckets = [Bucket(small_bucket_size, lower_bound, [count_per_bucket], data_wrapper=SyntheticData)
200+
for lower_bound in missing_lower_bounds]
173201

174202
small_buckets += synthetic_buckets
175203
small_buckets.sort(key=lambda bucket: bucket.lower_bound)
176204
return small_buckets
177-
178-
179-
class BucketData:
180-
def __init__(self, bucket_data):
181-
if type(bucket_data) == type([]):
182-
self.count = sum([data['count'] for data in bucket_data])
183-
self.min = min([data['min'] for data in bucket_data])
184-
self.max = max([data['max'] for data in bucket_data])
185-
else:
186-
self.count = bucket_data['count']
187-
self.min = bucket_data['min']
188-
self.max = bucket_data['max']
189-
190-
def __str__(self):
191-
return f'BucketData(count: {self.count}, min: {self.min}, max: {self.max})'

0 commit comments

Comments
 (0)