This repository was archived by the owner on Jul 2, 2024. It is now read-only.

Commit b077692

Daniel Lennon authored and committed
Switch from asyncpg to psycopg2
(asyncpg was incompatible with Aircloak)
Overhaul bucketing logic
Build queries using psycopg2.sql instead of string interpolation
Enable export of bucketed data as pandas constructor args
Add a Jupyter notebook with basic examples, etc.
1 parent d92148a commit b077692

7 files changed: +284 -98 lines changed
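
A note on the headline change: query text is no longer assembled with f-strings but composed with psycopg2's sql module, so identifiers are quoted by the driver instead of being pasted into the string. A minimal sketch of the two patterns (not code from this commit; the table and column names are taken from the example usage in explorer/numeric.py below):

from psycopg2 import sql

table, column = 'loans', 'amount'

# Old pattern: identifiers interpolated straight into the query text.
old_query = f'SELECT {column}, count(*) FROM {table} GROUP BY 1'

# New pattern: compose typed parts; sql.Identifier values are quoted when the query is rendered.
new_query = sql.SQL('SELECT {column}, count(*) FROM {table} GROUP BY 1').format(
    column=sql.Identifier(column),
    table=sql.Identifier(table),
)
# new_query is a sql.Composed object that cursor.execute() accepts directly.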

.gitignore

+1
@@ -1,2 +1,3 @@
 .venv
 .vscode
+*.pyc

explorer/__init__.py

Whitespace-only changes.

explorer/buckets.py

+89-30
@@ -1,45 +1,104 @@
 import logging
 
+# assume for now that we want at least 20 values per bucket (valid?)
+# also that the smallest useful bucket size is at 1/100 of the total range
+MAX_BUCKETS = 100
+MIN_BUCKET_COUNT = 20
+
+'''
+A note about terminology:
+Bucket size: The range of values contained in a bucket
+Bucket count: The number of values contained in a bucket
+Bucket number: The number of buckets contained in a range
+
+For example for 1000 values in the range 0 - 100, we may have:
+Bucket size: 5
+Bucket count: 50 on average
+Bucket number: 20
+'''
+
 
 class Buckets():
     def __init__(self):
         self.buckets = sorted([base * (10 ** exponent)
                                for base in [1, 2, 5] for exponent in range(-4, 20)])
 
-    def estimate_bucket_size(self, lower, upper):
-        # If the lower bound is higher than the upper bound, prioritise the lower bound
-        # to avoid having too many useless buckets
-        #
-        if lower > upper:
-            return self._next_after(lower)
+    def estimate_bucket_size(self, value_range: float, value_count: int,
+                             num_buckets=MAX_BUCKETS, min_bucket_count=MIN_BUCKET_COUNT) -> int:
+        '''Estimate a suitable bucket size based on desired precision and size restrictions.
+
+        :param value_range: The size of the value range to be bucketed.
+        :param value_count: The number of values contained in the dataset.
+        :param num_buckets: The desired number of buckets for sufficient precision / resolution.
+        :param min_bucket_count: The lowest number of values desired in each bucket.
+        :returns: A suitable bucket size.
+
+        For example, the dataset contains 10_000 values in the range 2042 -> 5683.
+        The value_range is 5683 - 2042 = 3641.
+        If num_buckets is 100, the average bucket size is 36.4 for an estimated bucket count
+        of 100.
+        At the min_bucket_count of 20 we would have 10_000 / 20 = 500 buckets of size 3641 / 500 ≈ 7.3.
+        So we would like a bucket size of at least 7.3 and we are targeting 36.4 for sufficient precision.
+        In this range there are two suitable bucket sizes: 10 and 20.
+        >>> Buckets().estimate_bucket_size(5683 - 2042, 10_000)
+        20
+
+        Note:
+        - The returned size may not meet both of the desired criteria.
+        - The min_bucket_count takes priority.
+        '''
+        # Estimate lower and upper bounds for the bucket size
+        precision_bound = value_range / num_buckets
+        size_bound = value_range / (value_count / min_bucket_count)
+
+        bs_candidates = self.buckets_in_range(size_bound, precision_bound)
+
+        if len(bs_candidates) == 0:
+            # No bucket sizes within the range, prioritise the size bound
+            return self._next_after(size_bound)
         else:
-            bs_candidate_lower = self._next_after(lower)
-            bs_candidate_upper = self._first_before(upper)
-            if bs_candidate_lower == bs_candidate_upper:
-                return bs_candidate_lower
-            else:
-                # If both estimates fall outside the intended range, choose the closest
-                if bs_candidate_upper < lower and bs_candidate_lower > upper:
-                    diff_below = lower - bs_candidate_upper
-                    diff_above = bs_candidate_lower - upper
-                    if diff_below < diff_above:
-                        return bs_candidate_upper
-                    else:
-                        return bs_candidate_lower
-                # Otherwise if the lower estimate is within the bounds, choose it
-                elif bs_candidate_lower < upper:
-                    return bs_candidate_lower
-                # Otherwise check that the upper estimate is within bounds, if so, choose it
-                elif bs_candidate_upper > lower:
-                    return bs_candidate_upper
-                # If none of these conditions apply, something has gone wrong...
-                else:
-                    logging.error(
-                        f'Unable to estimate bucket size for range {lower} -> {upper}')
-                    return 0
+            # Otherwise choose the largest bucket size within the range
+            return max(bs_candidates)
+
+        # bs_candidate_lower = self._next_after(lower)
+        # bs_candidate_upper = self._first_before(upper)
+        # if bs_candidate_lower == bs_candidate_upper:
+        #     # There is only one bucket size that falls within the desired range
+        #     return bs_candidate_lower
+        # else:
+        #     # If both estimates fall outside the intended range, choose estimate
+        #     # based on the lower bound
+        #     return bs_candidate_lower
+        # # Otherwise if the lower estimate is within the bounds, choose it
+        # elif bs_candidate_lower < upper:
+        #     return bs_candidate_lower
+        # # Otherwise check that the upper estimate is within bounds, if so, choose it
+        # elif bs_candidate_upper > lower:
+        #     return bs_candidate_upper
+        # # If none of these conditions apply, something has gone wrong...
+        # else:
+        #     logging.error(
+        #         f'Unable to estimate bucket size for range {lower} -> {upper}')
+        #     return 0
 
     def _next_after(self, val):
         return next(v for v in self.buckets if v > val)
 
     def _first_before(self, val):
         return next(v for v in reversed(self.buckets) if v < val)
+
+    def buckets_smaller_than(self, val):
+        return (v for v in self.buckets if v < val)
+
+    def buckets_larger_than(self, val):
+        return (v for v in self.buckets if v > val)
+
+    def buckets_in_range(self, lo, hi) -> set:
+        smaller_than_hi = set(self.buckets_smaller_than(hi))
+        larger_than_lo = set(self.buckets_larger_than(lo))
+        return smaller_than_hi & larger_than_lo
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
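
As a sanity check on the estimation logic above, the docstring's numbers can be reproduced step by step. A small sketch, assuming the explorer package from this commit is importable:

from explorer.buckets import Buckets

value_range = 5683 - 2042        # 3641, as in the docstring example
value_count = 10_000

precision_bound = value_range / 100            # ~36.4, from MAX_BUCKETS
size_bound = value_range / (value_count / 20)  # ~7.3, from MIN_BUCKET_COUNT

b = Buckets()
candidates = b.buckets_in_range(size_bound, precision_bound)
print(sorted(candidates))                                # [10, 20]
print(b.estimate_bucket_size(value_range, value_count))  # 20, the largest candidate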

explorer/connection.py

+20-12
@@ -1,23 +1,31 @@
-import asyncio
-import asyncpg
+import psycopg2
+from psycopg2.extras import DictCursor
 import logging
 
 
 class AircloakConnection():
-    def __init__(self, **kwargs):
+    def __init__(self, *, dbname):
         self.user = 'daniel-613C7ADF4535BB56DBCD'
         self.port = 9432
         self.host = 'attack.aircloak.com'
-        self.database = 'gda_banking'
+        self.dbname = dbname
 
-    async def connect(self):
-        self.conn = await asyncpg.connect(user=self.user, host=self.host, port=self.port, database=self.database)
+        logging.debug(
+            f'Connecting to Aircloak: user={self.user}, host={self.host}, port={self.port}, dbname={self.dbname}')
 
-    async def close(self):
-        await self.conn.close()
+        self.conn = psycopg2.connect(
+            user=self.user, host=self.host, port=self.port, dbname=self.dbname, cursor_factory=DictCursor)
+
+    def close(self):
+        self.conn.close()
+
+    def fetch(self, query):
+        logging.debug(f'Sending query: {query.as_string(self.conn)}')
+        with self.conn.cursor() as cur:
+            cur.execute(query)
+            result = {
+                'rows': cur.fetchall(),
+                'labels': [col.name for col in cur.description]
+            }
 
-    async def run_query(self, query_fn, **query_args):
-        query_str = query_fn(**query_args)
-        logging.debug(f'Querying: {query_str}')
-        result = await self.conn.fetch(query_str)
         return result
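
A hedged usage sketch of the new synchronous connection class; it assumes the explorer package is importable, the Aircloak endpoint above is reachable, and the gda_banking dataset exists (the same names used in explorer/numeric.py below):

import logging

from explorer import queries
from explorer.connection import AircloakConnection

logging.basicConfig(level=logging.DEBUG)

ac = AircloakConnection(dbname='gda_banking')
result = ac.fetch(queries.top_level_stats(table='loans', column='amount'))

print(result['labels'])   # column names taken from cursor.description
print(result['rows'][0])  # DictCursor rows can be read by index or by column name
ac.close()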

explorer/numeric.py

+40-46
@@ -1,68 +1,62 @@
-import asyncio
-import asyncpg
 import logging
 
-from buckets import Buckets
-from connection import AircloakConnection
+from .buckets import Buckets
+from .connection import AircloakConnection
 
-import queries
+from . import queries
 
-# assume for now that we want at least 20 values per bucket (valid?)
-# also that the smallest useful bucket size is at 1/100 of the total range
-MAX_BUCKETS = 100
-MIN_BUCKET_SIZE = 20
 
+class Explorer:
+    def __init__(self, *, dbname):
+        self.stats = {}
+        self.ac = AircloakConnection(dbname=dbname)
 
-async def explore_numeric_col(table: str, column: str, max_buckets=MAX_BUCKETS, min_bucket_size=MIN_BUCKET_SIZE):
-    ac = AircloakConnection()
-    await ac.connect()
+    def explore_numeric_col(self, *, table: str, column: str):
+        stats = self.ac.fetch(
+            queries.top_level_stats(table=table, column=column))
 
-    stats = await ac.run_query(queries.top_level_stats, table=table, column=column)
-    distincts = await ac.run_query(queries.top_level_distinct, table=table, column=column)
+        distincts = self.ac.fetch(queries.top_level_distinct(
+            table=table, column=column))
 
-    stats = stats[0]
-    count_total = stats['count']
-    suppresed_count = count_suppressed(distincts, 0)
+        stats = stats['rows'][0]
+        suppressed_count = count_suppressed(distincts['rows'], column)
 
-    suppressed_ratio = suppresed_count / count_total
+        suppressed_ratio = suppressed_count / stats['count']
 
-    if suppressed_ratio > 0.05:
-        # too many supressed values, lets drill down
-        value_range = stats['max'] - stats['min']
+        if suppressed_ratio > 0.05:
+            # too many suppressed values, let's drill down
+            value_range = stats['max'] - stats['min']
 
-        # Estimate lower and upper bounds for the bucket size
-        bs_lower_bound = value_range / max_buckets
-        bs_upper_bound = value_range / (count_total / min_bucket_size)
+            bucket_size = Buckets().estimate_bucket_size(
+                value_range, stats['count'])
 
-        bucket_size = Buckets().estimate_bucket_size(bs_lower_bound, bs_upper_bound)
+            self.stats[(table, column)] = self.ac.fetch(
+                queries.bucketed_stats(table=table, column=column, bucket_size=bucket_size))
 
-        bucketed_stats = await ac.run_query(queries.bucketed_stats, table=table, column=column, bucket_size=bucket_size)
+            # TODO: check quality of returned buckets and, if necessary, launch more queries with adjusted bucket size.
 
-    # TODO: check quality of returned buckets and, if necessary launch more queries.
+    def histogram(self, *, table, column):
+        stats = self.stats[(table, column)]['rows']
+        return [row['bucket'] for row in stats[1:]], [row['count'] for row in stats[1:]]
 
-    await ac.close()
+    def to_dataframe(self, *, table, column):
+        stats = self.stats[(table, column)]
+        return {
+            'data': stats['rows'],
+            'columns': stats['labels'],
+            'index': None,
+        }
 
-    return bucketed_stats
+    def __del__(self):
+        self.ac.close()
 
 
 def count_suppressed(rows, col, count_col='count'):
     return next(r[count_col] for r in rows if r[col] == None)
 
 
-def run_exp(exp):
-    loop = asyncio.get_event_loop()
-    loop.set_debug(True)
-    return loop.run_until_complete(exp)
-
-
-# if __name__ == "__main__":
-#     async def main():
-#         logging.basicConfig(level=logging.DEBUG)
-#         ac = AircloakConnection()
-#         await ac.connect()
-#         values = await ac.run_query('bucketed', table='loans', column='amount', bucket_size=10000)
-#         logging.debug(values)
-
-#     loop = asyncio.get_event_loop()
-#     loop.set_debug()
-#     loop.run_until_complete(main())
+if __name__ == "__main__":
+    e = Explorer(dbname='gda_banking')
+    e.explore_numeric_col(table='loans', column='amount')
+    x, y = e.histogram(table='loans', column='amount')
+    print(x, y)
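
The to_dataframe() method above returns exactly the keyword arguments that pandas.DataFrame accepts, which is what the commit message means by exporting bucketed data as pandas constructor args. A minimal sketch, assuming pandas is installed and the column actually gets bucketed (explore_numeric_col only stores stats when more than 5% of values are suppressed):

import pandas as pd

from explorer.numeric import Explorer

e = Explorer(dbname='gda_banking')
e.explore_numeric_col(table='loans', column='amount')

# Unpack the stored rows and labels straight into the DataFrame constructor.
df = pd.DataFrame(**e.to_dataframe(table='loans', column='amount'))
print(df.head())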

explorer/queries.py

+13-10
@@ -1,17 +1,20 @@
+import logging
+from psycopg2 import sql
 
-def top_level_distinct(table: str, column: str):
-    return f'''
+
+def top_level_distinct(*, table: str, column: str):
+    return sql.SQL('''
     SELECT
         {column}
         , count(*)
     FROM {table}
     GROUP BY 1
-    ORDER BY count DESC
-    '''
+    ORDER BY 2 DESC
+    ''').format(table=sql.Identifier(table), column=sql.Identifier(column))
 
 
-def top_level_stats(table: str, column: str):
-    return f'''
+def top_level_stats(*, table: str, column: str):
+    return sql.SQL('''
     SELECT
         min({column})
         , max({column})
@@ -20,11 +23,11 @@ def top_level_stats(table: str, column: str):
         , count(*)
         , count_noise(*)
     FROM {table}
-    '''
+    ''').format(table=sql.Identifier(table), column=sql.Identifier(column))
 
 
-def bucketed_stats(table: str, column: str, bucket_size: int):
-    return f'''
+def bucketed_stats(*, table: str, column: str, bucket_size: int):
+    return sql.SQL('''
     SELECT
         bucket({column} by {bucket_size})
         , {bucket_size} as bucket_size
@@ -36,4 +39,4 @@ def bucketed_stats(table: str, column: str, bucket_size: int):
         , count_noise(*)
     FROM {table}
     GROUP BY 1
-    '''
+    ''').format(table=sql.Identifier(table), column=sql.Identifier(column), bucket_size=sql.Literal(bucket_size))
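
These builders return psycopg2.sql.Composed objects rather than strings; they render to concrete SQL only against an open connection, which is what AircloakConnection.fetch logs. A small sketch, assuming conn is an already-open psycopg2 connection:

from explorer import queries

q = queries.bucketed_stats(table='loans', column='amount', bucket_size=10000)

# Identifiers come out double-quoted; the bucket size is inlined as a literal.
print(q.as_string(conn))

# The Composed object can also be passed to cursor.execute() without rendering it first.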
