Skip to content

Commit d1fe366

Browse files
kindly authored and Bjwebb committed
[OpenDataServices/cove#418] Reduce memory footprint
Move around some things to stop data being copied. Reduce memory for when no source maps are created.
1 parent d6938b5 commit d1fe366

File tree

3 files changed

+79
-73
lines changed

3 files changed

+79
-73
lines changed

flattentool/__init__.py

+2 -31
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,9 @@
33
from flattentool.output import FORMATS as OUTPUT_FORMATS
44
from flattentool.output import FORMATS_SUFFIX
55
from flattentool.input import FORMATS as INPUT_FORMATS, WITH_CELLS
6+
from flattentool.lib import decimal_default
67
import json
78
import codecs
8-
from decimal import Decimal
99
from collections import OrderedDict
1010

1111

@@ -79,26 +79,6 @@ def spreadsheet_output(spreadsheet_output_class, name):
7979
raise Exception('The requested format is not available')
8080

8181

82-
# From http://bugs.python.org/issue16535
83-
class NumberStr(float):
84-
def __init__(self, o):
85-
# We don't call the parent here, since we're deliberately altering it's functionality
86-
# pylint: disable=W0231
87-
self.o = o
88-
89-
def __repr__(self):
90-
return str(self.o)
91-
92-
# This is needed for this trick to work in python 3.4
93-
def __float__(self):
94-
return self
95-
96-
97-
def decimal_default(o):
98-
if isinstance(o, Decimal):
99-
return NumberStr(o)
100-
raise TypeError(repr(o) + " is not JSON serializable")
101-
10282

10383
def unflatten(input_name, base_json=None, input_format=None, output_name='unflattened.json',
10484
root_list_path='main', encoding='utf8', timezone_name='UTC',
@@ -132,16 +112,7 @@ def unflatten(input_name, base_json=None, input_format=None, output_name='unflat
132112
else:
133113
base = OrderedDict()
134114
if WITH_CELLS:
135-
result, cell_source_map_data, heading_source_map_data = spreadsheet_input.fancy_unflatten()
136-
base[root_list_path] = list(result)
137-
with codecs.open(output_name, 'w', encoding='utf-8') as fp:
138-
json.dump(base, fp, indent=4, default=decimal_default, ensure_ascii=False)
139-
if cell_source_map:
140-
with codecs.open(cell_source_map, 'w', encoding='utf-8') as fp:
141-
json.dump(cell_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
142-
if heading_source_map:
143-
with codecs.open(heading_source_map, 'w', encoding='utf-8') as fp:
144-
json.dump(heading_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
115+
spreadsheet_input.fancy_unflatten(base, root_list_path, output_name, cell_source_map, heading_source_map)
145116
else:
146117
result = spreadsheet_input.unflatten()
147118
base[root_list_path] = list(result)

flattentool/input.py

+49 -42
Original file line number | Diff line number | Diff line change
@@ -8,22 +8,22 @@
88
import sys
99
from decimal import Decimal, InvalidOperation
1010
import os
11+
import codecs
1112
from collections import OrderedDict
13+
1214
import openpyxl
1315
from six import text_type
1416
from warnings import warn
1517
import traceback
1618
import datetime
19+
import json
1720
import pytz
1821
from openpyxl.utils import _get_column_letter, column_index_from_string
22+
from flattentool.lib import decimal_default, Cell
23+
import tempfile
1924

2025
WITH_CELLS = True
2126

22-
class Cell:
23-
def __init__(self, cell_value, cell_location):
24-
self.cell_value = cell_value
25-
self.cell_location = cell_location
26-
self.sub_cells = []
2727

2828
# The "pylint: disable" lines suppress warnings about imports that we expect to fail
2929

@@ -231,26 +231,41 @@ def inthere(unflattened, id_name):
231231
else:
232232
main_sheet_by_ocid[root_id_or_none].append(unflattened)
233233
temporarydicts_to_lists(main_sheet_by_ocid)
234+
234235
return sum(main_sheet_by_ocid.values(), [])
235236

237+
236238
def unflatten(self):
    """Return the unflattened data as plain JSON-compatible values.

    When ``WITH_CELLS`` is true, ``do_unflatten()`` yields a tree containing
    ``Cell`` objects.  Rather than copying that tree in memory to strip the
    cells, it is serialised to a temporary JSON file by
    ``results_from_cell_tree`` and read back, which leaves only the values.

    Returns:
        The list stored under the ``'main'`` key of the round-tripped JSON
        (objects are loaded as ``OrderedDict``), or the direct result of
        ``do_unflatten()`` when ``WITH_CELLS`` is false.
    """
    if not WITH_CELLS:
        return self.do_unflatten()

    # Function-scope import: shutil is only needed for the cleanup below.
    import shutil

    tmp_directory = tempfile.mkdtemp()
    try:
        file_name = os.path.join(tmp_directory, 'unflattened.json')
        self.results_from_cell_tree({}, 'main', file_name)
        # The file is written as UTF-8 with ensure_ascii=False, so it must be
        # read back as UTF-8 rather than with the locale's default encoding.
        with codecs.open(file_name, encoding='utf-8') as unflattened:
            return json.load(unflattened, object_pairs_hook=OrderedDict)['main']
    finally:
        # mkdtemp() leaves deletion to the caller; without this every call
        # would leave a directory behind.
        shutil.rmtree(tmp_directory, ignore_errors=True)
246+
247+
248+
def extract_error_path(self, cell_tree):
    """Return the ``(path, value)`` items of the cell tree's error-path map, sorted.

    The map is built by ``extract_list_to_error_path`` starting from
    ``self.root_list_path``.
    """
    path_map = extract_list_to_error_path([self.root_list_path], cell_tree)
    ordered = list(path_map.items())
    ordered.sort()
    return ordered
250+
241251

242-
def fancy_unflatten(self):
252+
def results_from_cell_tree(self, base, main_sheet_name, output_name):
    """Unflatten the input, dump it to ``output_name`` as JSON and return the sorted path items.

    ``base`` is modified in place: the cell tree is stored under
    ``main_sheet_name`` before the whole mapping is written out as UTF-8
    JSON.  ``decimal_default`` is the serialisation hook, so ``Cell``
    objects are written as their ``cell_value``.

    Returns:
        The result of ``self.extract_error_path`` for the cell tree.
    """
    tree = self.do_unflatten()
    base[main_sheet_name] = tree
    with codecs.open(output_name, 'w', encoding='utf-8') as out_file:
        json.dump(base, out_file, indent=4, default=decimal_default, ensure_ascii=False)
    error_paths = self.extract_error_path(tree)
    return error_paths
258+
259+
260+
def fancy_unflatten(self, base, main_sheet_name, output_name, cell_source_map, heading_source_map):
243261
if not WITH_CELLS:
244262
raise Exception('Can only do a fancy_unflatten() if WITH_CELLS=True')
245-
cell_tree = self.do_unflatten()
246-
result = extract_list_to_value(cell_tree)
247-
cell_source_map = extract_list_to_error_path([self.root_list_path], cell_tree)
248-
ordered_items = sorted(cell_source_map.items())
249-
ordered_cell_source_map = OrderedDict(( '/'.join(str(x) for x in path), location) for path, location in ordered_items)
263+
ordered_items = self.results_from_cell_tree(base, main_sheet_name, output_name)
264+
if not cell_source_map and not heading_source_map:
265+
return
250266
row_source_map = OrderedDict()
251-
heading_source_map = OrderedDict()
252-
for path, _ in ordered_items:
253-
cells = cell_source_map[path]
267+
heading_source_map_data = OrderedDict()
268+
for path, cells in ordered_items:
254269
# Prepare row_source_map key
255270
key = '/'.join(str(x) for x in path[:-1])
256271
if not key in row_source_map:
@@ -263,19 +278,28 @@ def fancy_unflatten(self):
263278
except:
264279
header_path_parts.append(x)
265280
header_path = '/'.join(header_path_parts)
266-
if header_path not in heading_source_map:
267-
heading_source_map[header_path] = []
281+
if header_path not in heading_source_map_data:
282+
heading_source_map_data[header_path] = []
268283
# Populate the row and header source maps
269284
for cell in cells:
270285
sheet, col, row, header = cell
271286
if (sheet, row) not in row_source_map[key]:
272287
row_source_map[key].append((sheet, row))
273-
if (sheet, header) not in heading_source_map[header_path]:
274-
heading_source_map[header_path].append((sheet, header))
288+
if (sheet, header) not in heading_source_map_data[header_path]:
289+
heading_source_map_data[header_path].append((sheet, header))
275290
for key in row_source_map:
276-
assert key not in ordered_cell_source_map, 'Row/cell collision: {}'.format(key)
277-
ordered_cell_source_map[key] = row_source_map[key]
278-
return result, ordered_cell_source_map, heading_source_map
291+
ordered_items.append((key.split('/'), row_source_map[key]))
292+
293+
if cell_source_map:
294+
with codecs.open(cell_source_map, 'w', encoding='utf-8') as fp:
295+
json.dump(
296+
OrderedDict(( '/'.join(str(x) for x in path), location) for path, location in ordered_items),
297+
fp, default=decimal_default, ensure_ascii=False, indent=4
298+
)
299+
if heading_source_map:
300+
with codecs.open(heading_source_map, 'w', encoding='utf-8') as fp:
301+
json.dump(heading_source_map_data, fp, indent=4, default=decimal_default, ensure_ascii=False)
302+
279303

280304
def extract_list_to_error_path(path, input):
281305
output = {}
@@ -310,24 +334,6 @@ def extract_dict_to_error_path(path, input):
310334
raise Exception('Unexpected result type in the JSON cell tree: {}'.format(input[k]))
311335
return output
312336

313-
def extract_list_to_value(input):
314-
output = []
315-
for item in input:
316-
output.append(extract_dict_to_value(item))
317-
return output
318-
319-
def extract_dict_to_value(input):
320-
output = OrderedDict()
321-
for k in input:
322-
if isinstance(input[k], list):
323-
output[k] = extract_list_to_value(input[k])
324-
elif isinstance(input[k], dict):
325-
output[k] = extract_dict_to_value(input[k])
326-
elif isinstance(input[k], Cell):
327-
output[k] = input[k].cell_value
328-
else:
329-
raise Exception('Unexpected result type in the JSON cell tree: {}'.format(input[k]))
330-
return output
331337

332338
class CSVInput(SpreadsheetInput):
333339
encoding = 'utf-8'
@@ -538,6 +544,7 @@ def path_search(nested_dict, path_list, id_fields=None, path=None, top=False, to
538544

539545

540546
class TemporaryDict(UserDict):
547+
__slots__ = ['keyfield', 'items_no_keyfield', 'data', 'top_sheet']
541548
def __init__(self, keyfield, top_sheet=False):
542549
self.keyfield = keyfield
543550
self.items_no_keyfield = []

flattentool/lib.py

+28
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
from decimal import Decimal
2+
# From http://bugs.python.org/issue16535
class NumberStr(float):
    """Float subclass whose ``repr`` is the string form of the wrapped object.

    ``decimal_default`` wraps ``Decimal`` values in this class so that a
    JSON encoder which formats floats through ``repr`` writes ``str(o)``
    instead of a binary-float rendering of the number.

    NOTE(review): this depends on the encoder calling ``repr`` on float
    subclasses -- confirm that holds on every interpreter version in use.
    """

    def __init__(self, o):
        # We don't call the parent here, since we're deliberately altering its functionality
        # pylint: disable=W0231
        self.o = o

    def __repr__(self):
        return str(self.o)

    # This is needed for this trick to work in python 3.4
    def __float__(self):
        return self
15+
16+
class Cell(object):
    """A single spreadsheet value together with where it came from.

    Inherits from ``object`` explicitly so that ``__slots__`` also takes
    effect on Python 2, where a bare ``class Cell:`` is an old-style class
    and keeps a per-instance ``__dict__``.

    Attributes are documented here rather than inline:
      cell_value    -- the value; ``decimal_default`` serialises a Cell as this.
      cell_location -- location of the cell as supplied by the caller
                       (NOTE(review): presumably sheet/column/row/heading;
                       verify against the code that constructs cells).
      sub_cells     -- list, empty on construction, filled in by callers.
    """
    __slots__ = ['cell_value', 'cell_location', 'sub_cells']

    def __init__(self, cell_value, cell_location):
        self.cell_value = cell_value
        self.cell_location = cell_location
        # A fresh list per instance, never shared between cells.
        self.sub_cells = []
22+
23+
def decimal_default(o):
    """``default=`` hook for ``json.dump`` covering ``Decimal`` and ``Cell``.

    Returns:
        ``o.cell_value`` for a ``Cell``; a ``NumberStr`` wrapper for a
        ``Decimal``.

    Raises:
        TypeError: for any other type, matching what the json module
            expects from a ``default`` callable.
    """
    if isinstance(o, Cell):
        return o.cell_value
    if isinstance(o, Decimal):
        return NumberStr(o)
    raise TypeError(repr(o) + " is not JSON serializable")

0 commit comments

Comments
 (0)