diff --git a/dumpgenerator.py b/dumpgenerator.py index 5980a6d4..60bb5034 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -1171,6 +1171,12 @@ def getParameters(params=[]): help='store only the current version of pages') groupDownload.add_argument( '--images', action='store_true', help="generates an image dump") + groupDownload.add_argument( + '--resources', + default="html", + choices=["html","dir","warc"], + help="""generate a backup of Main Page as HTML or with resources (CSS, etc.). The dir + and warc options require wget and may leave your IP address in the requisites.""") groupDownload.add_argument( '--namespaces', metavar="1,2,3", @@ -1200,7 +1206,7 @@ def getParameters(params=[]): sys.exit(1) # No download params and no meta info params? Exit - if (not args.xml and not args.images) and \ + if (not args.xml and not args.images and args.resources == 'html') and \ (not args.get_wiki_engine): print 'ERROR: Use at least one download param or meta info param' parser.print_help() @@ -1335,6 +1341,7 @@ def getParameters(params=[]): 'xml': args.xml, 'namespaces': namespaces, 'exnamespaces': exnamespaces, + 'resources': args.resources, 'path': args.path or '', 'cookies': args.cookies or '', 'delay': args.delay @@ -1641,7 +1648,19 @@ def saveSpecialVersion(config={}, session=None): def saveIndexPHP(config={}, session=None): """ Save index.php as .html, to preserve license details available at the botom of the page """ - + escaped_index = "'" + config['index'].replace("'", "'\\''") + "'" + escaped_path = "'" + (config['path'] + '/requisites').replace("'", "'\\''") + "'" + wget_dir = 'wget -e robots=off -p -k -H -nd -P %s --restrict-file-names=windows' + wget_dir %= escaped_path + wget_warc = wget_dir + ' --warc-file=%s' % escaped_path + if config['resources'] == 'warc': + print 'Downloading index.php (Main Page) with all resources to requisites and requisites.warc.gz' + os.system(wget_warc + ' ' + escaped_index) + return + if config['resources'] == 'dir': + print 'Downloading index.php (Main Page) with all resources to requisites' + os.system(wget_dir + ' ' + escaped_index) + return if os.path.exists('%s/index.html' % (config['path'])): print 'index.html exists, do not overwrite' else: