diff --git a/dumpgenerator.py b/dumpgenerator.py
index 41dcb53f..220cabfc 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -573,19 +573,31 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
     if 'templates' in config and config['templates']:
         params['templates'] = 1
 
-    xml = getXMLPageCore(params=params, config=config, session=session)
-    if xml == "":
-        raise ExportAbortedError(config['index'])
-    if not "</page>" in xml:
-        raise PageMissingError(params['title'], xml)
-    else:
-        # strip these sha1s sums which keep showing up in the export and
-        # which are invalid for the XML schema (they only apply to
-        # revisions)
-        xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
-        xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+    while True:
+        try:
+            xml = getXMLPageCore(params=params, config=config, session=session)
+            if xml == "":
+                raise ExportAbortedError(config['index'])
+            if "</page>" not in xml:
+                raise PageMissingError(params['title'], xml)
+            else:
+                # do the split before the regexes because .split throws a
+                # MemoryError if it runs out of memory, regexes just kills the
+                # process outright, this lets us download larger pages
+                xml = xml.split("</page>")[0]
+
+                # strip these sha1s sums which keep showing up in the export and
+                # which are invalid for the XML schema (they only apply to
+                # revisions)
+                xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
+                xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
+            break
+        except MemoryError:
+            print "The page's history exceeds our memory, halving limit."
+            params['limit'] = params['limit'] / 2
+            continue
 
-    yield xml.split("</page>")[0]
+    yield xml
 
     # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
     # else, warning about Special:Export truncating large page histories