Skip to content

Commit 4824df2

Browse files
committed
Flattening: Reduce memory Footprint.
* Use ijson * Use pyopenxl write_only mode * Store sheet lines in an embedded btree ZODB index #316
1 parent 843dd99 commit 4824df2

10 files changed

+228
-140
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
1616

1717
### Fixed
1818

19+
- flattening: Uses much less memory by storing data in a embedded ZODB database, using ijson and using write only mode in pyopenxl.
1920
- use-titles: Use $ref'erring title if available https://github.com/OpenDataServices/flatten-tool/pull/368
2021
- create-template --no-deprecated-fields: Did not work if deprecated element at same level as a $ref https://github.com/OpenDataServices/flatten-tool/issues/185#issuecomment-719587348
2122

flattentool/__init__.py

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def flatten(
112112
else:
113113
schema_parser = None
114114

115-
parser = JSONParser(
115+
with JSONParser(
116116
json_filename=input_name,
117117
root_list_path=None if root_is_list else root_list_path,
118118
schema_parser=schema_parser,
@@ -126,33 +126,33 @@ def flatten(
126126
preserve_fields=preserve_fields,
127127
remove_empty_schema_columns=remove_empty_schema_columns,
128128
truncation_length=truncation_length,
129-
)
130-
parser.parse()
131-
132-
def spreadsheet_output(spreadsheet_output_class, name):
133-
spreadsheet_output = spreadsheet_output_class(
134-
parser=parser,
135-
main_sheet_name=main_sheet_name,
136-
output_name=name,
137-
sheet_prefix=sheet_prefix,
138-
)
139-
spreadsheet_output.write_sheets()
140-
141-
if output_format == "all":
142-
if not output_name:
143-
output_name = "flattened"
144-
for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items():
145-
spreadsheet_output(
146-
spreadsheet_output_class, output_name + FORMATS_SUFFIX[format_name]
129+
persist=True,
130+
) as parser:
131+
132+
def spreadsheet_output(spreadsheet_output_class, name):
133+
spreadsheet_output = spreadsheet_output_class(
134+
parser=parser,
135+
main_sheet_name=main_sheet_name,
136+
output_name=name,
137+
sheet_prefix=sheet_prefix,
147138
)
139+
spreadsheet_output.write_sheets()
140+
141+
if output_format == "all":
142+
if not output_name:
143+
output_name = "flattened"
144+
for format_name, spreadsheet_output_class in OUTPUT_FORMATS.items():
145+
spreadsheet_output(
146+
spreadsheet_output_class, output_name + FORMATS_SUFFIX[format_name]
147+
)
148148

149-
elif output_format in OUTPUT_FORMATS.keys(): # in dictionary of allowed formats
150-
if not output_name:
151-
output_name = "flattened" + FORMATS_SUFFIX[output_format]
152-
spreadsheet_output(OUTPUT_FORMATS[output_format], output_name)
149+
elif output_format in OUTPUT_FORMATS.keys(): # in dictionary of allowed formats
150+
if not output_name:
151+
output_name = "flattened" + FORMATS_SUFFIX[output_format]
152+
spreadsheet_output(OUTPUT_FORMATS[output_format], output_name)
153153

154-
else:
155-
raise Exception("The requested format is not available")
154+
else:
155+
raise Exception("The requested format is not available")
156156

157157

158158
# From http://bugs.python.org/issue16535

flattentool/json_input.py

Lines changed: 83 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,24 @@
77

88
import codecs
99
import copy
10-
import json
1110
import os
11+
import tempfile
12+
import uuid
1213
from collections import OrderedDict
1314
from decimal import Decimal
1415
from warnings import warn
1516

17+
import BTrees.OOBTree
18+
import ijson
19+
import transaction
1620
import xmltodict
21+
import zc.zlibstorage
22+
import ZODB.FileStorage
1723

1824
from flattentool.i18n import _
1925
from flattentool.input import path_search
2026
from flattentool.schema import make_sub_sheet_name
21-
from flattentool.sheet import Sheet
27+
from flattentool.sheet import PersistentSheet
2228

2329
BASIC_TYPES = [str, bool, int, Decimal, type(None)]
2430

@@ -112,9 +118,26 @@ def __init__(
112118
remove_empty_schema_columns=False,
113119
rollup=False,
114120
truncation_length=3,
121+
persist=False,
115122
):
123+
if persist:
124+
self.zodb_db_location = (
125+
tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4())
126+
)
127+
zodb_storage = zc.zlibstorage.ZlibStorage(
128+
ZODB.FileStorage.FileStorage(self.zodb_db_location)
129+
)
130+
self.db = ZODB.DB(zodb_storage)
131+
else:
132+
# If None, in memory storage is used.
133+
self.db = ZODB.DB(None)
134+
135+
self.connection = self.db.open()
136+
root = self.connection.root
137+
root.sheet_store = BTrees.OOBTree.BTree()
138+
116139
self.sub_sheets = {}
117-
self.main_sheet = Sheet()
140+
self.main_sheet = PersistentSheet(connection=self.connection, name="")
118141
self.root_list_path = root_list_path
119142
self.root_id = root_id
120143
self.use_titles = use_titles
@@ -125,9 +148,17 @@ def __init__(
125148
self.filter_value = filter_value
126149
self.remove_empty_schema_columns = remove_empty_schema_columns
127150
self.seen_paths = set()
151+
self.persist = persist
128152

129153
if schema_parser:
130-
self.main_sheet = copy.deepcopy(schema_parser.main_sheet)
154+
self.main_sheet = PersistentSheet.from_sheet(
155+
schema_parser.main_sheet, self.connection
156+
)
157+
for sheet_name, sheet in list(self.sub_sheets.items()):
158+
self.sub_sheets[sheet_name] = PersistentSheet.from_sheet(
159+
sheet, self.connection
160+
)
161+
131162
self.sub_sheets = copy.deepcopy(schema_parser.sub_sheets)
132163
if remove_empty_schema_columns:
133164
# Don't use columns from the schema parser
@@ -194,18 +225,13 @@ def __init__(
194225
_("Only one of json_file or root_json_dict should be supplied")
195226
)
196227

197-
if json_filename:
198-
with codecs.open(json_filename, encoding="utf-8") as json_file:
199-
try:
200-
self.root_json_dict = json.load(
201-
json_file, object_pairs_hook=OrderedDict, parse_float=Decimal
202-
)
203-
except UnicodeError as err:
204-
raise BadlyFormedJSONErrorUTF8(*err.args)
205-
except ValueError as err:
206-
raise BadlyFormedJSONError(*err.args)
207-
else:
208-
self.root_json_dict = root_json_dict
228+
if not json_filename:
229+
if self.root_list_path is None:
230+
self.root_json_list = root_json_dict
231+
else:
232+
self.root_json_list = path_search(
233+
root_json_dict, self.root_list_path.split("/")
234+
)
209235

210236
if preserve_fields:
211237
# Extract fields to be preserved from input file (one path per line)
@@ -240,19 +266,37 @@ def __init__(
240266
self.preserve_fields = None
241267
self.preserve_fields_input = None
242268

269+
if json_filename:
270+
if self.root_list_path is None:
271+
path = "item"
272+
else:
273+
path = root_list_path.replace("/", ".") + ".item"
274+
275+
json_file = codecs.open(json_filename, encoding="utf-8")
276+
277+
self.root_json_list = ijson.items(json_file, path, map_type=OrderedDict)
278+
279+
try:
280+
self.parse()
281+
except ijson.common.IncompleteJSONError as err:
282+
raise BadlyFormedJSONError(*err.args)
283+
except UnicodeDecodeError as err:
284+
raise BadlyFormedJSONErrorUTF8(*err.args)
285+
finally:
286+
if json_filename:
287+
json_file.close()
288+
243289
def parse(self):
244-
if self.root_list_path is None:
245-
root_json_list = self.root_json_dict
246-
else:
247-
root_json_list = path_search(
248-
self.root_json_dict, self.root_list_path.split("/")
249-
)
250-
for json_dict in root_json_list:
290+
for num, json_dict in enumerate(self.root_json_list):
251291
if json_dict is None:
252292
# This is particularly useful for IATI XML, in order to not
253293
# fall over on empty activity, e.g. <iati-activity/>
254294
continue
255295
self.parse_json_dict(json_dict, sheet=self.main_sheet)
296+
if num % 2000 == 0 and num != 0:
297+
transaction.commit()
298+
299+
transaction.commit()
256300

257301
if self.remove_empty_schema_columns:
258302
# Remove sheets with no lines of data
@@ -501,7 +545,9 @@ def parse_json_dict(
501545
parent_name, key, truncation_length=self.truncation_length
502546
)
503547
if sub_sheet_name not in self.sub_sheets:
504-
self.sub_sheets[sub_sheet_name] = Sheet(name=sub_sheet_name)
548+
self.sub_sheets[sub_sheet_name] = PersistentSheet(
549+
name=sub_sheet_name, connection=self.connection
550+
)
505551

506552
for json_dict in value:
507553
if json_dict is None:
@@ -518,4 +564,16 @@ def parse_json_dict(
518564
raise ValueError(_("Unsupported type {}").format(type(value)))
519565

520566
if top:
521-
sheet.lines.append(flattened_dict)
567+
sheet.append_line(flattened_dict)
568+
569+
def __enter__(self):
570+
return self
571+
572+
def __exit__(self, type, value, traceback):
573+
if self.persist:
574+
self.connection.close()
575+
self.db.close()
576+
os.remove(self.zodb_db_location)
577+
os.remove(self.zodb_db_location + ".lock")
578+
os.remove(self.zodb_db_location + ".index")
579+
os.remove(self.zodb_db_location + ".tmp")

flattentool/output.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def close(self):
5050

5151
class XLSXOutput(SpreadsheetOutput):
5252
def open(self):
53-
self.workbook = openpyxl.Workbook()
53+
self.workbook = openpyxl.Workbook(write_only=True)
5454

5555
def write_sheet(self, sheet_name, sheet):
5656
sheet_header = list(sheet)
@@ -75,7 +75,6 @@ def write_sheet(self, sheet_name, sheet):
7575
worksheet.append(line)
7676

7777
def close(self):
78-
self.workbook.remove(self.workbook.active)
7978
self.workbook.save(self.output_name)
8079

8180

flattentool/sheet.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
import copy
2+
3+
import BTrees.IOBTree
4+
5+
16
class Sheet(object):
27
"""
38
An abstract representation of a single sheet of a spreadsheet.
@@ -8,10 +13,14 @@ def __init__(self, columns=None, root_id="", name=None):
813
self.id_columns = []
914
self.columns = columns if columns else []
1015
self.titles = {}
11-
self.lines = []
16+
self._lines = []
1217
self.root_id = root_id
1318
self.name = name
1419

20+
@property
21+
def lines(self):
22+
return self._lines
23+
1524
def add_field(self, field, id_field=False):
1625
columns = self.id_columns if id_field else self.columns
1726
if field not in columns:
@@ -27,3 +36,39 @@ def __iter__(self):
2736
yield column
2837
for column in self.columns:
2938
yield column
39+
40+
def append_line(self, flattened_dict):
41+
self._lines.append(flattened_dict)
42+
43+
44+
class PersistentSheet(Sheet):
45+
"""
46+
A sheet that is persisted in ZODB database.
47+
48+
"""
49+
50+
def __init__(self, columns=None, root_id="", name=None, connection=None):
51+
super().__init__(columns=columns, root_id=root_id, name=name)
52+
self.connection = connection
53+
self.index = 0
54+
connection.root.sheet_store[self.name] = BTrees.IOBTree.BTree()
55+
56+
@property
57+
def lines(self):
58+
for key, value in self.connection.root.sheet_store[self.name].items():
59+
if key % 5000 == 0:
60+
self.connection.cacheMinimize()
61+
yield value
62+
63+
def append_line(self, flattened_dict):
64+
self.connection.root.sheet_store[self.name][self.index] = flattened_dict
65+
self.index += 1
66+
67+
@classmethod
68+
def from_sheet(cls, sheet, connection):
69+
instance = cls(name=sheet.name, connection=connection)
70+
instance.id_columns = copy.deepcopy(sheet.id_columns)
71+
instance.columns = copy.deepcopy(sheet.columns)
72+
instance.titles = copy.deepcopy(sheet.titles)
73+
instance.root_id = sheet.root_id
74+
return instance

0 commit comments

Comments
 (0)