|
1 | 1 | import logging
|
2 | 2 |
|
# Assume for now that we want at least 20 values per bucket (still to be
# validated), and that the smallest useful bucket size is 1/100 of the
# total value range.
MAX_BUCKETS = 100
MIN_BUCKET_COUNT = 20

'''
A note about terminology:
Bucket size: The range of values contained in a bucket
Bucket count: The number of values contained in a bucket
Bucket number: The number of buckets contained in a range

For example for 1000 values in the range 0 - 100, we may have:
Bucket size: 5
Bucket count: 50 on average
Bucket number: 20
'''


class Buckets():
    '''Pick "round" bucket sizes from a fixed 1-2-5 series.

    The candidate sizes are 1, 2 and 5 times each power of ten from
    10**-4 up to 10**19, kept in ascending order in ``self.buckets``.
    Sizes below 1 are floats; sizes of 1 and above are ints.
    '''

    def __init__(self):
        self.buckets = sorted(base * (10 ** exponent)
                              for base in (1, 2, 5)
                              for exponent in range(-4, 20))

    def estimate_bucket_size(self, value_range: float, value_count: int,
                             num_buckets: int = MAX_BUCKETS,
                             min_bucket_count: int = MIN_BUCKET_COUNT) -> float:
        '''Estimate a suitable bucket size based on desired precision and size restrictions.

        :param value_range: The size of the value range to be bucketed.
        :param value_count: The number of values contained in the dataset.
        :param num_buckets: The desired number of buckets for sufficient precision / resolution.
        :param min_bucket_count: The lowest number of values desired in each bucket.
        :returns: A suitable bucket size, taken from ``self.buckets``.
        :raises ZeroDivisionError: If num_buckets, value_count or
            min_bucket_count is zero.
        :raises StopIteration: If the size bound is not below the largest
            predefined bucket size, so no fallback size exists.

        For example, the dataset contains 10_000 values in the range 2042 -> 5683.
        The value_range is 5683 - 2042 = 3641.
        If num_buckets is 100, the average bucket size is 36.4 for an estimated
        bucket count of 100.
        At the min_bucket_count of 20 we would have 10_000 / 20 = 500 buckets
        of size 3641 / 500 = 7.3.
        So we would like a bucket size of at least 7.3 and at most 36.4.
        In this range there are two suitable bucket sizes, 10 and 20, and the
        largest one is chosen.
        >>> Buckets().estimate_bucket_size(5683 - 2042, 10_000)
        20

        When the two bounds leave no room (here: at least 20, at most 1), the
        first size above the size bound is returned instead.
        >>> Buckets().estimate_bucket_size(100, 100)
        50

        Note:
        - The returned size may not meet both of the desired criteria.
        - The min_bucket_count takes priority.
        '''
        # Upper limit on the bucket size: anything larger yields fewer than
        # num_buckets buckets across the range.
        precision_bound = value_range / num_buckets
        # Lower limit on the bucket size: anything smaller yields, on
        # average, fewer than min_bucket_count values per bucket.
        size_bound = value_range / (value_count / min_bucket_count)

        candidates = self.buckets_in_range(size_bound, precision_bound)
        if not candidates:
            # No bucket sizes within the range, prioritise the size bound.
            return self._next_after(size_bound)
        # Otherwise choose the largest bucket size within the range.
        return max(candidates)

    def _next_after(self, val):
        '''Return the smallest predefined size strictly greater than val.

        Raises StopIteration when no such size exists.
        '''
        return next(v for v in self.buckets if v > val)

    def _first_before(self, val):
        '''Return the largest predefined size strictly less than val.

        Raises StopIteration when no such size exists.
        '''
        return next(v for v in reversed(self.buckets) if v < val)

    def buckets_smaller_than(self, val):
        '''Lazily yield, in ascending order, the sizes strictly below val.'''
        return (v for v in self.buckets if v < val)

    def buckets_larger_than(self, val):
        '''Lazily yield, in ascending order, the sizes strictly above val.'''
        return (v for v in self.buckets if v > val)

    def buckets_in_range(self, lo, hi) -> set:
        '''Return the set of sizes strictly between lo and hi.

        The set is empty when lo >= hi or when no size falls in between.
        '''
        return {v for v in self.buckets if lo < v < hi}
| 100 | + |
| 101 | + |
if __name__ == "__main__":
    # Executed as a script: run the examples embedded in the docstrings.
    import doctest as _doctest
    _doctest.testmod()
0 commit comments