Skip to content

Commit 123d981

Browse files
committed
Flattening: Add comments per review
#316
1 parent 4824df2 commit 123d981

File tree

4 files changed

+18
-0
lines changed

4 files changed

+18
-0
lines changed

flattentool/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def flatten(
112112
else:
113113
schema_parser = None
114114

115+
# context manager to clean up ZODB database when it exits
115116
with JSONParser(
116117
json_filename=input_name,
117118
root_list_path=None if root_is_list else root_list_path,

flattentool/json_input.py

+11
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,11 @@ def __init__(
121121
persist=False,
122122
):
123123
if persist:
124+
# Use the temp directory in an OS-agnostic way
124125
self.zodb_db_location = (
125126
tempfile.gettempdir() + "/flattentool-" + str(uuid.uuid4())
126127
)
128+
# zlibstorage lowers disk usage by a lot at a very small performance cost
127129
zodb_storage = zc.zlibstorage.ZlibStorage(
128130
ZODB.FileStorage.FileStorage(self.zodb_db_location)
129131
)
@@ -133,7 +135,10 @@ def __init__(
133135
self.db = ZODB.DB(None)
134136

135137
self.connection = self.db.open()
138+
139+
# ZODB root, only objects attached here will be persisted
136140
root = self.connection.root
141+
# OOBTree means a BTree whose keys and values are both objects (including strings)
137142
root.sheet_store = BTrees.OOBTree.BTree()
138143

139144
self.sub_sheets = {}
@@ -151,6 +156,8 @@ def __init__(
151156
self.persist = persist
152157

153158
if schema_parser:
159+
# schema parser does not make sheets that are persistent,
160+
# so use from_sheet which deep copies everything in it.
154161
self.main_sheet = PersistentSheet.from_sheet(
155162
schema_parser.main_sheet, self.connection
156163
)
@@ -293,9 +300,13 @@ def parse(self):
293300
# fall over on empty activity, e.g. <iati-activity/>
294301
continue
295302
self.parse_json_dict(json_dict, sheet=self.main_sheet)
303+
# Only persist every 2000 objects; persisting more often slows down storing.
304+
# 2000 top-level objects are normally not too much to store in memory.
296305
if num % 2000 == 0 and num != 0:
297306
transaction.commit()
298307

308+
# This commit could be removed, which would mean that up to 2000 objects
309+
# could be stored in memory without anything being persisted.
299310
transaction.commit()
300311

301312
if self.remove_empty_schema_columns:

flattentool/output.py

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def close(self):
5050

5151
class XLSXOutput(SpreadsheetOutput):
5252
def open(self):
53+
# write_only=True means that the output will be streamed
5354
self.workbook = openpyxl.Workbook(write_only=True)
5455

5556
def write_sheet(self, sheet_name, sheet):

flattentool/sheet.py

+5
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,16 @@ def __init__(self, columns=None, root_id="", name=None, connection=None):
5151
super().__init__(columns=columns, root_id=root_id, name=name)
5252
self.connection = connection
5353
self.index = 0
54+
# Integer key and object value btree. Store sequential index in order to preserve input order.
5455
connection.root.sheet_store[self.name] = BTrees.IOBTree.BTree()
5556

5657
@property
5758
def lines(self):
59+
# btrees iterate in key order.
5860
for key, value in self.connection.root.sheet_store[self.name].items():
61+
# 5000 chosen by trial and error. The written row
62+
# data is removed from memory as it is no longer needed.
63+
# All new sheets clear out previous sheets' data from memory.
5964
if key % 5000 == 0:
6065
self.connection.cacheMinimize()
6166
yield value

0 commit comments

Comments
 (0)