From 3ab24cfd27ef9ab109f2f2e7744d943228d2cf70 Mon Sep 17 00:00:00 2001 From: PiRSquared17 Date: Mon, 2 Mar 2015 23:47:35 +0000 Subject: [PATCH 1/2] Allow saving resources (work-in-progress) --- dumpgenerator.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 5980a6d4..a21022da 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -1171,6 +1171,12 @@ def getParameters(params=[]): help='store only the current version of pages') groupDownload.add_argument( '--images', action='store_true', help="generates an image dump") + groupDownload.add_argument( + '--resources', + default="html", + choices=["html","dir","warc"], + help="""generate a backup of Main Page as HTML or with resources (CSS, etc.). The dir + and warc options require wget and may leave your IP address in the requisites.""") groupDownload.add_argument( '--namespaces', metavar="1,2,3", @@ -1335,6 +1341,7 @@ def getParameters(params=[]): 'xml': args.xml, 'namespaces': namespaces, 'exnamespaces': exnamespaces, + 'resources': args.resources, 'path': args.path or '', 'cookies': args.cookies or '', 'delay': args.delay @@ -1641,7 +1648,19 @@ def saveSpecialVersion(config={}, session=None): def saveIndexPHP(config={}, session=None): """ Save index.php as .html, to preserve license details available at the botom of the page """ - + escaped_index = "'" + config['index'].replace("'", "'\\''") + "'" + escaped_path = "'" + (config['path'] + '/requisites').replace("'", "'\\''") + "'" + wget_dir = 'wget -e robots=off -p -k -H -nd -P %s --restrict-file-names=windows' + wget_dir %= escaped_path + wget_warc = wget_dir + ' --warc-file=%s' % escaped_path + if config['resources'] == 'warc': + print 'Downloading index.php (Main Page) with all resources to requisites and requisites.warc.gz' + os.system(wget_warc + ' ' + escaped_index) + return + if config['resources'] == 'dir': + print 'Downloading index.php (Main Page) with all resources to requisites' + os.system(wget_dir + ' ' + escaped_index) + return if os.path.exists('%s/index.html' % (config['path'])): print 'index.html exists, do not overwrite' else: From 4247f3c026c422dc7710f1efd3bd550b8185f1ab Mon Sep 17 00:00:00 2001 From: PiRSquared17 Date: Mon, 2 Mar 2015 23:50:36 +0000 Subject: [PATCH 2/2] If non-default argument provided, allow saving resources only --- dumpgenerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index a21022da..60bb5034 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -1206,7 +1206,7 @@ def getParameters(params=[]): sys.exit(1) # No download params and no meta info params? Exit - if (not args.xml and not args.images) and \ + if (not args.xml and not args.images and args.resources == 'html') and \ (not args.get_wiki_engine): print 'ERROR: Use at least one download param or meta info param' parser.print_help()