[OpenDataServices/cove#418] Reduce memory footprint

kindly · Bjwebb · commit d1fe3660bcaf · 2016-06-23T15:08:53.000+01:00
Move around some things to stop data being copied.
Reduce memory for when no source maps are created.
diff --git a/flattentool/__init__.py b/flattentool/__init__.py
@@ -3,9 +3,9 @@
 from flattentool.output import FORMATS as OUTPUT_FORMATS
 from flattentool.output import FORMATS_SUFFIX
 from flattentool.input import FORMATS as INPUT_FORMATS, WITH_CELLS
+from flattentool.lib import decimal_default
 import json
 import codecs
-from decimal import Decimal
 from collections import OrderedDict
 
 
@@ -79,26 +79,6 @@ def spreadsheet_output(spreadsheet_output_class, name):
         raise Exception('The requested format is not available')
 
 
-# From http://bugs.python.org/issue16535
-class NumberStr(float):
-    def __init__(self, o):
-        # We don't call the parent here, since we're deliberately altering it's functionality
-        # pylint: disable=W0231
-        self.o = o
-
-    def __repr__(self):
-        return str(self.o)
-
-    # This is needed for this trick to work in python 3.4
-    def __float__(self):
-        return self
-
-
-def decimal_default(o):
-    if isinstance(o, Decimal):
-        return NumberStr(o)
-    raise TypeError(repr(o) + " is not JSON serializable")
-
 
 def unflatten(input_name, base_json=None, input_format=None, output_name='unflattened.json',
               root_list_path='main', encoding='utf8', timezone_name='UTC',
@@ -132,16 +112,7 @@ def unflatten(input_name, base_json=None, input_format=None, output_name='unflat
     else:
         base = OrderedDict()
     if WITH_CELLS:
-        result, cell_source_map_data, heading_source_map_data = spreadsheet_input.fancy_unflatten()
-        base[root_list_path] = list(result)
-        with codecs.open(output_name, 'w', encoding='utf-8') as fp:
-            json.dump(base, fp, indent=4, default=decimal_default, ensure_ascii=False)
-        if cell_source_map:
-            with codecs.open(cell_source_map, 'w', encoding='utf-8') as fp:
-                json.dump(cell_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
-        if heading_source_map:
-            with codecs.open(heading_source_map, 'w', encoding='utf-8') as fp:
-                json.dump(heading_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
+        spreadsheet_input.fancy_unflatten(base, root_list_path, output_name, cell_source_map, heading_source_map)
     else:
         result = spreadsheet_input.unflatten()
         base[root_list_path] = list(result)
diff --git a/flattentool/input.py b/flattentool/input.py
@@ -8,22 +8,22 @@
 import sys
 from decimal import Decimal, InvalidOperation
 import os
+import codecs
 from collections import OrderedDict
+
 import openpyxl
 from six import text_type
 from warnings import warn
 import traceback
 import datetime
+import json
 import pytz
 from openpyxl.utils import _get_column_letter, column_index_from_string
+from flattentool.lib import decimal_default, Cell
+import tempfile
 
 WITH_CELLS = True
 
-class Cell:
-    def __init__(self, cell_value, cell_location):
-        self.cell_value = cell_value
-        self.cell_location = cell_location
-        self.sub_cells = []
 
 # The "pylint: disable" lines exist to ignore warnings about the imports we expect not to work not working
 
@@ -231,26 +231,41 @@ def inthere(unflattened, id_name):
                 else:
                     main_sheet_by_ocid[root_id_or_none].append(unflattened)
         temporarydicts_to_lists(main_sheet_by_ocid)
+
         return sum(main_sheet_by_ocid.values(), [])
 
+
     def unflatten(self):
-        result = self.do_unflatten()
         if WITH_CELLS:
-            result = extract_list_to_value(result)
-        return result
+            tmp_directory = tempfile.mkdtemp()
+            file_name = os.path.join(tmp_directory, 'unflattened.json')
+            self.results_from_cell_tree({}, 'main', file_name)
+            with open(file_name) as unflattened:
+                return json.load(unflattened, object_pairs_hook=OrderedDict)['main']
+        return self.do_unflatten()
+
+
+    def extract_error_path(self, cell_tree):
+        return sorted(extract_list_to_error_path([self.root_list_path], cell_tree).items())
+
 
-    def fancy_unflatten(self):
+    def results_from_cell_tree(self, base, main_sheet_name, output_name):
+        cell_tree = self.do_unflatten()
+        base[main_sheet_name] = cell_tree
+        with codecs.open(output_name, 'w', encoding='utf-8') as fp:
+            json.dump(base, fp, indent=4, default=decimal_default, ensure_ascii=False)
+        return self.extract_error_path(cell_tree)
+
+
+    def fancy_unflatten(self, base, main_sheet_name, output_name, cell_source_map, heading_source_map):
         if not WITH_CELLS:
             raise Exception('Can only do a fancy_unflatten() if WITH_CELLS=True')
-        cell_tree = self.do_unflatten()
-        result = extract_list_to_value(cell_tree)
-        cell_source_map = extract_list_to_error_path([self.root_list_path], cell_tree)
-        ordered_items = sorted(cell_source_map.items())
-        ordered_cell_source_map = OrderedDict(( '/'.join(str(x) for x in path), location) for path, location in ordered_items)
+        ordered_items = self.results_from_cell_tree(base, main_sheet_name, output_name)
+        if not cell_source_map and not heading_source_map:
+            return
         row_source_map = OrderedDict()
-        heading_source_map = OrderedDict()
-        for path, _ in ordered_items:
-            cells = cell_source_map[path]
+        heading_source_map_data = OrderedDict()
+        for path, cells in ordered_items:
             # Prepare row_source_map key
             key = '/'.join(str(x) for x in path[:-1])
             if not key in row_source_map:
@@ -263,19 +278,28 @@ def fancy_unflatten(self):
                 except:
                     header_path_parts.append(x)
             header_path = '/'.join(header_path_parts)
-            if header_path not in heading_source_map:
-                heading_source_map[header_path] = []
+            if header_path not in heading_source_map_data:
+                heading_source_map_data[header_path] = []
             # Populate the row and header source maps
             for cell in cells:
                 sheet, col, row, header = cell
                 if (sheet, row) not in row_source_map[key]:
                     row_source_map[key].append((sheet, row))
-                if (sheet, header) not in heading_source_map[header_path]:
-                    heading_source_map[header_path].append((sheet, header))
+                if (sheet, header) not in heading_source_map_data[header_path]:
+                    heading_source_map_data[header_path].append((sheet, header))
         for key in row_source_map:
-            assert key not in ordered_cell_source_map, 'Row/cell collision: {}'.format(key)
-            ordered_cell_source_map[key] = row_source_map[key]
-        return result, ordered_cell_source_map, heading_source_map
+            ordered_items.append((key.split('/'), row_source_map[key]))
+
+        if cell_source_map:
+            with codecs.open(cell_source_map, 'w', encoding='utf-8') as fp:
+                json.dump(
+                    OrderedDict(( '/'.join(str(x) for x in path), location) for path, location in ordered_items),
+                    fp, default=decimal_default, ensure_ascii=False, indent=4
+                )
+        if heading_source_map:
+            with codecs.open(heading_source_map, 'w', encoding='utf-8') as fp:
+                json.dump(heading_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
+
 
 def extract_list_to_error_path(path, input):
     output = {}
@@ -310,24 +334,6 @@ def extract_dict_to_error_path(path, input):
             raise Exception('Unexpected result type in the JSON cell tree: {}'.format(input[k]))
     return output
 
-def extract_list_to_value(input):
-    output = []
-    for item in input:
-        output.append(extract_dict_to_value(item))
-    return output
-
-def extract_dict_to_value(input):
-    output = OrderedDict()
-    for k in input:
-        if isinstance(input[k], list):
-            output[k] = extract_list_to_value(input[k])
-        elif isinstance(input[k], dict):
-            output[k] = extract_dict_to_value(input[k])
-        elif isinstance(input[k], Cell):
-            output[k] = input[k].cell_value
-        else:
-            raise Exception('Unexpected result type in the JSON cell tree: {}'.format(input[k]))
-    return output
 
 class CSVInput(SpreadsheetInput):
     encoding = 'utf-8'
@@ -538,6 +544,7 @@ def path_search(nested_dict, path_list, id_fields=None, path=None, top=False, to
 
 
 class TemporaryDict(UserDict):
+    __slots__ = ['keyfield', 'items_no_keyfield', 'data', 'top_sheet']
     def __init__(self, keyfield, top_sheet=False):
         self.keyfield = keyfield
         self.items_no_keyfield = []
diff --git a/flattentool/lib.py b/flattentool/lib.py
@@ -0,0 +1,28 @@
+from decimal import Decimal
+# From http://bugs.python.org/issue16535
+class NumberStr(float):
+    def __init__(self, o):
+        # We don't call the parent here, since we're deliberately altering it's functionality
+        # pylint: disable=W0231
+        self.o = o
+
+    def __repr__(self):
+        return str(self.o)
+
+    # This is needed for this trick to work in python 3.4
+    def __float__(self):
+        return self
+
+class Cell:
+    __slots__ = ['cell_value', 'cell_location', 'sub_cells']
+    def __init__(self, cell_value, cell_location):
+        self.cell_value = cell_value
+        self.cell_location = cell_location
+        self.sub_cells = []
+
+def decimal_default(o):
+    if isinstance(o, Decimal):
+        return NumberStr(o)
+    if isinstance(o, Cell):
+        return o.cell_value
+    raise TypeError(repr(o) + " is not JSON serializable")