diff --git a/.gitattributes b/.gitattributes index 80e1ac6a..da62a380 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,12 @@ *.com linguist-vendored *.org linguist-vendored + +*.py text=auto +*.sh text=auto +*.json text=auto +*.txt text=auto +*.md text=auto + +*.html linguist-detectable=false + +wikiteam3/dumpgenerator/test/data/* linguist-vendored diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml new file mode 100644 index 00000000..6d527718 --- /dev/null +++ b/.github/workflows/nix.yml @@ -0,0 +1,34 @@ +name: Nix Build and Check + +on: + push: + branches: [ main, v4-main ] + pull_request: + branches: [ main, v4-main ] + +jobs: + nix-build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install Nix + uses: cachix/install-nix-action@v27 + with: + extra_nix_config: | + experimental-features = nix-command flakes + + - name: Check flake + run: nix flake check + + - name: Build package + run: nix build + + - name: Test wikiteam3dumpgenerator + run: | + nix run . 
-- --help + + - name: Verify reproducibility + run: | + nix build --rebuild + nix path-info --json | jq -r '.[].narHash' diff --git a/.github/workflows/test-dumogenerator.yml b/.github/workflows/test-dumogenerator.yml new file mode 100644 index 00000000..50c0e162 --- /dev/null +++ b/.github/workflows/test-dumogenerator.yml @@ -0,0 +1,43 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: dumogenerator test + +on: + push: + pull_request: + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.12", "3.14"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y libxml2-dev libxslt-dev # for lxml + python -m pip install --upgrade pip + python -m pip install flake8 pytest + pip install . + - name: Lint with flake8 + run: | + # exit if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: run dumpgenerator + run: | + python -m wikiteam3.dumpgenerator -h + - name: Test with pytest + run: | + pytest diff --git a/.gitignore b/.gitignore index 2680feb1..9cccda88 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,15 @@ *.pyc -testing/*.pyc -testing/dumpgenerator.py -/.tox +.pytest_cache keys.txt batchdownload/keys.txt batchdownload/dumpgenerator.py batchdownload/uploader.py +__pycache__ +tests/tmp +dist/ +.DS_Store +desktop.ini +.venv +.vscode +.idea diff --git a/.pdm-python b/.pdm-python new file mode 100644 index 00000000..0a061173 --- /dev/null +++ b/.pdm-python @@ -0,0 +1 @@ +/home/yzqzss/git/wikiteam3/.venv/bin/python \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index e229d0f6..00000000 --- a/.travis.yml +++ /dev/null @@ -1,8 +0,0 @@ -language: python -python: 2.7 -install: - - pip install tox -script: - - tox -notifications: - email: false diff --git a/DEV.md b/DEV.md new file mode 100644 index 00000000..8f9a52ea --- /dev/null +++ b/DEV.md @@ -0,0 +1,89 @@ +# WikiTeam3 internal + +## Images.txt structure + +```python +filename + "\t" + url + "\t" + uploader ++ "\t" + (str(size) if size else NULL) ++ "\t" + (str(sha1) if sha1 else NULL) ++ "\t" + (timestamp if timestamp else NULL) ++ "\n" +``` + +*optional fields: +- "null" (magic None value, since wikiteam3 v4.0.0) +- "False" (magic None value, before wikiteam3 v4.0.0) +- not present (ancient wikiteam3 versions) + +# Snippets + +## API Output format + +https://www.mediawiki.org/wiki/API:Data_formats#Output + +> The standard and default output format in MediaWiki is JSON. All other formats are discouraged. +> +> The output format should always be specified using format=yourformat with yourformat being one of the following: +> +> json: JSON format. (recommended) +> php: serialized PHP format. (deprecated) +> xml: XML format. (deprecated) +> txt: PHP print_r() format. 
(removed in 1.27) +> dbg: PHP var_export() format. (removed in 1.27) +> yaml: YAML format. (removed in 1.27) +> wddx: WDDX format. (removed in 1.26) +> dump: PHP var_dump() format. (removed in 1.26) +> none: Returns a blank response. 1.21+ + +In our practice, `json` is not available for some old wikis. + +## Allpages + +https://www.mediawiki.org/wiki/API:Allpages (>= 1.8) + + +## Allimages + +https://www.mediawiki.org/wiki/API:Allimages (>= 1.13) + +## Redirects + +https://www.mediawiki.org/wiki/Manual:Redirect_table + +## Logs + +https://www.mediawiki.org/wiki/Manual:Logging_table + +## Continuation + +https://www.mediawiki.org/wiki/API:Continue (≥ 1.26) +https://www.mediawiki.org/wiki/API:Raw_query_continue (≥ 1.9) + +> From MediaWiki 1.21 to 1.25, it was required to specify continue= (i.e. with an empty string as the value) in the initial request to get continuation data in the format described above. Without doing that, API results would indicate there is additional data by returning a query-continue element, explained in Raw query continue. +> Prior to 1.21, that raw continuation (`query-continue`) was the only option. +> +> If your application needs to use the raw continuation in MediaWiki 1.26 or later, you must specify rawcontinue= to request it. + +# Workarounds + +## truncated API response causes infinite loop + +https://github.com/mediawiki-client-tools/mediawiki-dump-generator/issues/166 +https://phabricator.wikimedia.org/T86611 + +wikiteam3 workaround: https://github.com/saveweb/wikiteam3/commit/76465d34898b80e8c0eb6d9652aa8efa403a7ce7 + +## MWUnknownContentModelException + +> "The content model xxxxxx is not registered on this wiki;" + +Some extensions use custom content models for their own purposes, but they did not register a handler to export their content. 
+ +wikiteam3 workaround: https://github.com/saveweb/wikiteam3/commit/fd5a02a649dcf3bdab7ac1268445b0550130e6ee + +## Insecure SSL + +https://docs.openssl.org/1.1.1/man1/ciphers/ +https://docs.openssl.org/master/man1/openssl-ciphers/ + +wikiteam3 workaround: https://github.com/saveweb/wikiteam3/blob/8a054882de19c6b69bc03798d3044b7b5c4c3c88/wikiteam3/utils/monkey_patch.py#L63-L84 \ No newline at end of file diff --git a/MediaWikiArchive.png b/MediaWikiArchive.png new file mode 100644 index 00000000..08ba9986 Binary files /dev/null and b/MediaWikiArchive.png differ diff --git a/README.md b/README.md index 66165ef2..45842be3 100644 --- a/README.md +++ b/README.md @@ -1,98 +1,370 @@ -# WikiTeam -### We archive wikis, from Wikipedia to tiniest wikis +# `wikiteam3` + +![Dynamic JSON Badge](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Farchive.org%2Fadvancedsearch.php%3Fq%3Dsubject%3Awikiteam3%26rows%3D1%26page%3D1%26output%3Djson&query=%24.response.numFound&label=WikiTeam3%20Dumps%40IA) +[![PyPI version](https://badge.fury.io/py/wikiteam3.svg)](https://badge.fury.io/py/wikiteam3) + + +
+ +> Countless MediaWikis are still waiting to be archived. +> +> _Image by [@gledos](https://github.com/gledos/)_ + +`wikiteam3` is a fork of `mediawiki-scraper`. + +
+ +## Why we fork mediawiki-scraper + +Originally, mediawiki-scraper was named wikiteam3, but wikiteam upstream (py2 version) suggested that the name should be changed to avoid confusion with the original wikiteam. +Half a year later, we didn't see any py3 porting progress in the original wikiteam, and mediawiki-scraper lacks "code" reviewers. +So, we decided to break that suggestion, fork it and rename it back to wikiteam3, put the code here, and release it widely on PyPI. + +Everything is still under the GPLv3 license. + +
+ +## For webmaster + +We archive every MediaWiki site yearly and upload to the Internet Archive. +We crawl sites with 1.5s crawl-delay by default, and we respect Retry-After header. +If you don’t want your wiki to be archived, add the following to your `/robots.txt`: + +```robots.txt +User-agent: wikiteam3 +Disallow: / +``` + +Our bots are running on the following IPs: [wikiteam3.txt](https://static.saveweb.org/bots_ips/wikiteam3.txt) (ips+info) | [wikiteam3.ips.txt](https://static.saveweb.org/bots_ips/wikiteam3.ips.txt) (ips) + +## Installation/Upgrade + +```shell +pip install wikiteam3 --upgrade +``` + +>[!NOTE] +> For public MediaWiki, you don't need to install wikiteam3 locally. You can send an archive request (include the reason for the archive request, e.g. wiki is about to shutdown, need a wikidump to migrate to another wikifarm, etc.) to the wikiteam IRC channel. An online member will run a [wikibot](https://wikibot.digitaldragon.dev/) job for your request. +> +> Even more, we also accept DokuWiki and PukiWiki archive requests. +> +> - wikiteam IRC (webirc): +> - wikiteam IRC logs: https://irclogs.archivete.am/wikiteam + +## Dumpgenerator usage + + +
+ +```bash +usage: wikiteam3dumpgenerator [-h] [-v] [--cookies cookies.txt] [--delay 1.5] + [--retries 5] [--hard-retries 3] [--path PATH] + [--resume] [--force] [--user USER] + [--pass PASSWORD] [--http-user HTTP_USER] + [--http-pass HTTP_PASSWORD] + [--http-method {GET,POST}] [--insecure] + [--verbose] [--api_chunksize 50] [--api API] + [--index INDEX] [--index-check-threshold 0.80] + [--xml] [--curonly] [--xmlapiexport] + [--xmlrevisions] [--xmlrevisions_page] + [--redirects] [--namespaces 1,2,3] [--images] + [--bypass-cdn-image-compression] + [--image-timestamp-interval 2019-01-02T01:36:06Z/2023-08-12T10:36:06Z] + [--ia-wbm-booster {0,1,2,3}] + [--assert-max-pages 123] + [--assert-max-edits 123] + [--assert-max-images 123] + [--assert-max-images-bytes 123] + [--get-wiki-engine] [--failfast] [--upload] + [-g UPLOADER_ARGS] + [wiki] + +options: + -h, --help show this help message and exit + -v, --version show program's version number and exit + --cookies cookies.txt + path to a cookies.txt file + --delay 1.5 adds a delay (in seconds) [NOTE: most HTTP servers + have a 5s HTTP/1.1 keep-alive timeout, you should + consider it if you wanna reuse the connection] + --retries 5 Maximum number of retries for each request before + failing. + --hard-retries 3 Maximum number of hard retries for each request before + failing. (for now, this only controls the hard retries + during images downloading) + --path PATH path to store wiki dump at + --resume resumes previous incomplete dump (requires --path) + --force download it even if Wikimedia site or a recent dump + exists in the Internet Archive + --user USER Username if MediaWiki authentication is required. + --pass PASSWORD Password if MediaWiki authentication is required. + --http-user HTTP_USER + Username if HTTP authentication is required. + --http-pass HTTP_PASSWORD + Password if HTTP authentication is required. 
+ --http-method {GET,POST} + HTTP method to use when making API requests to the + wiki (default: POST) + --insecure Disable SSL certificate verification + --verbose + --api_chunksize 50 Chunk size for MediaWiki API (arvlimit, ailimit, etc.) + + wiki URL to wiki (e.g. http://wiki.domain.org), auto + detects API and index.php + --api API URL to API (e.g. http://wiki.domain.org/w/api.php) + --index INDEX URL to index.php (e.g. + http://wiki.domain.org/w/index.php), (not supported + with --images on newer(?) MediaWiki without --api) + --index-check-threshold 0.80 + pass index.php check if result is greater than (>) + this value (default: 0.80) + +Data to download: + What info download from the wiki + + --xml Export XML dump using Special:Export (index.php). + (supported with --curonly) + --curonly store only the latest revision of pages + --xmlapiexport Export XML dump using API:revisions instead of + Special:Export, use this when Special:Export fails and + xmlrevisions not supported. (supported with --curonly) + --xmlrevisions Export all revisions from an API generator + (API:Allrevisions). MediaWiki 1.27+ only. (not + supported with --curonly) + --xmlrevisions_page [[! Development only !]] Export all revisions from an + API generator, but query page by page MediaWiki 1.27+ + only. (default: --curonly) + --redirects Dump page redirects via API:Allredirects + --namespaces 1,2,3 comma-separated value of namespaces to include (all by + default) + --images Generates an image dump + +Image dump options: + Options for image dump (--images) + + --bypass-cdn-image-compression + Bypass CDN image compression. (CloudFlare Polish, + etc.) [WARNING: This will increase CDN origin traffic, + and not effective for all HTTP Server/CDN, please + don't use this blindly.] + --image-timestamp-interval 2019-01-02T01:36:06Z/2023-08-12T10:36:06Z + Only download images uploaded in the given time + interval. 
[format: ISO 8601 UTC interval] (only works + with api) + --ia-wbm-booster {0,1,2,3} + Download images from Internet Archive Wayback Machine + if possible, reduce the bandwidth usage of the wiki. + [0: disabled (default), 1: use earliest snapshot, 2: + use latest snapshot, 3: the closest snapshot to the + image's upload time] + +Assertions: + What assertions to check before actually downloading, if any assertion + fails, program will exit with exit code 45. [NOTE: This feature requires + correct siteinfo API response from the wiki, and not working properly with + some wikis. But it's useful for mass automated archiving, so you can + schedule a re-run for HUGE wiki that may run out of your disk] + + --assert-max-pages 123 + Maximum number of pages to download + --assert-max-edits 123 + Maximum number of edits to download + --assert-max-images 123 + Maximum number of images to download + --assert-max-images-bytes 123 + Maximum number of bytes to download for images [NOTE: + this assert happens after downloading images list] + +Meta info: + What meta info to retrieve from the wiki + + --get-wiki-engine returns the wiki engine + --failfast [lack maintenance] Avoid resuming, discard failing + wikis quickly. Useful only for mass downloads. + +wikiteam3uploader params: + --upload (run `wikiteam3uplaoder` for you) Upload wikidump to + Internet Archive after successfully dumped + -g, --uploader-arg UPLOADER_ARGS + Arguments for uploader. + +``` +
+ + + +### Downloading a wiki with complete XML history and images + +```bash +wikiteam3dumpgenerator http://wiki.domain.org --xml --images +``` + +>[!WARNING] +> +> `NTFS/Windows` users please note: When using `--images`, because NTFS does not allow characters such as `:*?"<>|` in filenames, some files may not be downloaded, please pay attention to the `XXXXX could not be created by OS` error in your `errors.log`. +> We will not make special treatment for NTFS/EncFS "path too long/illegal filename", highly recommend you to use ext4/xfs/btrfs, etc. +>
+> - Introducing the "illegal filename rename" mechanism will bring complexity. WikiTeam(python2) had this before, but it caused more problems, so it was removed in WikiTeam3. +> - It will cause confusion to the final user of wikidump (usually the Wiki site administrator). +> - NTFS is not suitable for large-scale image dump with millions of files in a single directory.(Windows background service will occasionally scan the whole disk, we think there should be no users using WIN/NTFS to do large-scale MediaWiki archive) +> - Using other file systems can solve all problems. +>
+ +### Manually specifying `api.php` and/or `index.php` + +If the script can't find itself the `api.php` and/or `index.php` paths, then you can provide them: + +```bash +wikiteam3dumpgenerator --api http://wiki.domain.org/w/api.php --xml --images +``` + +```bash +wikiteam3dumpgenerator --api http://wiki.domain.org/w/api.php --index http://wiki.domain.org/w/index.php \ + --xml --images +``` -**WikiTeam software is a set of tools for archiving wikis.** They work on MediaWiki wikis, but we want to expand to other wiki engines. As of 2020, WikiTeam has preserved more than [250,000 wikis](https://github.com/WikiTeam/wikiteam/wiki/Available-Backups), several wikifarms, regular Wikipedia dumps and [34 TB of Wikimedia Commons images](https://archive.org/details/wikimediacommons). - -There are [thousands](http://wikiindex.org) of [wikis](https://wikiapiary.com) in the Internet. Every day some of them are no longer publicly available and, due to lack of backups, lost forever. Millions of people download tons of media files (movies, music, books, etc) from the Internet, serving as a kind of distributed backup. Wikis, most of them under free licenses, disappear from time to time because nobody grabbed a copy of them. That is a shame that we would like to solve. - -**WikiTeam** is the [Archive Team](http://www.archiveteam.org) ([GitHub](https://github.com/ArchiveTeam)) subcommittee on wikis. It was founded and originally developed by [Emilio J. Rodríguez-Posada](https://github.com/emijrp), a Wikipedia veteran editor and amateur archivist. Many people have helped by sending suggestions, [reporting bugs](https://github.com/WikiTeam/wikiteam/issues), writing [documentation](https://github.com/WikiTeam/wikiteam/wiki), providing help in the [mailing list](http://groups.google.com/group/wikiteam-discuss) and making [wiki backups](https://github.com/WikiTeam/wikiteam/wiki/Available-Backups). 
Thanks to all, especially to: [Federico Leva](https://github.com/nemobis), [Alex Buie](https://github.com/ab2525), [Scott Boyd](http://www.sdboyd56.com), [Hydriz](https://github.com/Hydriz), Platonides, Ian McEwen, [Mike Dupont](https://github.com/h4ck3rm1k3), [balr0g](https://github.com/balr0g) and [PiRSquared17](https://github.com/PiRSquared17). - - - -
-Documentation - -Source code - -Download available backups - -Community - -Follow us on Twitter -
- -## Quick guide +If you only want the XML histories, just use `--xml`. For only the images, just `--images`. For only the current version of every page, `--xml --curonly`. -This is a very quick guide for the most used features of WikiTeam tools. For further information, read the [tutorial](https://github.com/WikiTeam/wikiteam/wiki/Tutorial) and the rest of the [documentation](https://github.com/WikiTeam/wikiteam/wiki). You can also ask in the [mailing list](http://groups.google.com/group/wikiteam-discuss). +### Resuming an incomplete dump + +
+ +```bash +wikiteam3dumpgenerator \ + --api http://wiki.domain.org/w/api.php --xml --images --resume --path /path/to/incomplete-dump +``` + +In the above example, `--path` is only necessary if the download path (wikidump dir) is not the default. + +>[!NOTE] +> +> en: When resuming an incomplete dump, the configuration in `config.json` will override the CLI parameters. (But not all CLI parameters will be ignored, check `config.json` for details) + +`wikiteam3dumpgenerator` will also ask you if you want to resume if it finds an incomplete dump in the path where it is downloading. + +
+ +## Using `wikiteam3uploader` + + +
+ +```bash +usage: Upload wikidump to the Internet Archive. [-h] [-kf KEYS_FILE] + [-c COLLECTION] [--dry-run] + [-u] [--bin-zstd BIN_ZSTD] + [--zstd-level {17,18,19,20,21,22}] + [--rezstd] + [--rezstd-endpoint URL] + [--bin-7z BIN_7Z] + [--parallel] + wikidump_dir + +positional arguments: + wikidump_dir + +options: + -h, --help show this help message and exit + -kf, --keys_file KEYS_FILE + Path to the IA S3 keys file. (first line: access key, + second line: secret key) [default: + ~/.wikiteam3_ia_keys.txt] + -c, --collection COLLECTION + --dry-run Dry run, do not upload anything. + -u, --update Update existing item. [!! not implemented yet !!] + --bin-zstd BIN_ZSTD Path to zstd binary. [default: zstd] + --zstd-level {17,18,19,20,21,22} + Zstd compression level. [default: 17] If you have a + lot of RAM, recommend to use max level (22). + --rezstd [server-side recompression] Upload pre-compressed zstd + files to rezstd server for recompression with best + settings (which may eat 10GB+ RAM), then download + back. (This feature saves your lowend machine, lol) + --rezstd-endpoint URL + Rezstd server endpoint. [default: http://pool- + rezstd.saveweb.org/rezstd/] (source code: + https://github.com/yzqzss/rezstd) + --bin-7z BIN_7Z Path to 7z binary. [default: 7z] + --parallel Parallelize compression tasks + +``` +
+ + ### Requirements -Requires Python 2.7. - -Confirm you satisfy the requirements: - -`pip install --upgrade -r requirements.txt` +> [!NOTE] +> +> Please make sure you have the following requirements before using `wikiteam3uploader`, and you don't need to install them if you don't wanna upload the dump to IA. -or, if you don't have enough permissions for the above, +- unbound localhost port 62954 (for the multi-process compression queue) +- 3GB+ RAM (~2.56GB for compressing) +- 64-bit OS (required by 2G `wlog` size) -`pip install --user --upgrade -r requirements.txt` +- `7z` (binary) + > Debian/Ubuntu: install `p7zip-full` -### Download any wiki + > [!NOTE] + > + > Windows: install and add `7z.exe` to PATH +- `zstd` (binary) + > 1.5.5+ (recommended), v1.5.0-v1.5.4(DO NOT USE), 1.4.8 (minimum) + > install from -To download any wiki, use one of the following options: - -`python dumpgenerator.py http://wiki.domain.org --xml --images` (complete XML histories and images) - -If the script can't find itself the API and/or index.php paths, then you can provide them: - -`python dumpgenerator.py --api=http://wiki.domain.org/w/api.php --xml --images` - -`python dumpgenerator.py --api=http://wiki.domain.org/w/api.php --index=http://wiki.domain.org/w/index.php --xml --images` - -If you only want the XML histories, just use `--xml`. For only the images, just `--images`. For only the current version of every page, `--xml --curonly`. + > [!NOTE] + > + > Windows: add `zstd.exe` to PATH -You can resume an aborted download: +### Uploader usage -`python dumpgenerator.py --api=http://wiki.domain.org/w/api.php --xml --images --resume --path=/path/to/incomplete-dump` +> [!NOTE] +> +> Read `wikiteam3uploader --help` and do not forget `~/.wikiteam3_ia_keys.txt` before using `wikiteam3uploader`. 
-See more options: +```bash +wikiteam3uploader {YOUR_WIKI_DUMP_PATH} +``` -`python dumpgenerator.py --help` +## Checking dump integrity -### Download Wikimedia dumps +TODO: xml2titles.py -To download [Wikimedia XML dumps](http://dumps.wikimedia.org/backup-index.html) (Wikipedia, Wikibooks, Wikinews, etc) you can run: +If you want to check the XML dump integrity, type this into your command line to count title, page and revision XML tags: -`python wikipediadownloader.py` (download all projects) +```bash +grep -E '' *.xml -c; grep -E '' *.xml -c; grep \ + "" *.xml -c;grep -E '' *.xml -c;grep "" *.xml -c +``` -See more options: +You should see something similar to this (not the actual numbers) - the first three numbers should be the same and the last two should be the same as each other: -`python wikipediadownloader.py --help` +```bash +580 +580 +580 +5677 +5677 +``` -### Download Wikimedia Commons images +If your first three numbers or your last two numbers are different, then, your XML dump is corrupt (it contains one or more unfinished `````` or ``````). This is not common in small wikis, but large or very large wikis may fail at this due to truncated XML pages while exporting and merging. The solution is to remove the XML dump and re-download, a bit boring, and it can fail again. -There is a script for this, but we have [uploaded the tarballs](https://archive.org/details/wikimediacommons) to Internet Archive, so it's more useful to reseed their torrents than to re-generate old ones with the script. +## import wikidump to MediaWiki / wikidump data tips -## Developers +> [!IMPORTANT] +> +> In the article name, spaces and underscores are treated as equivalent and each is converted to the other in the appropriate context (underscore in URL and database keys, spaces in plain text). 
-[![Build Status](https://travis-ci.org/WikiTeam/wikiteam.svg)](https://travis-ci.org/WikiTeam/wikiteam) +> [!NOTE] +> +> `WikiTeam3` uses `zstd` to compress `.xml` and `.txt` files, and `7z` to pack images (media files). +> `zstd` is a very fast stream compression algorithm, you can use `zstd -d` to decompress `.zst` file/steam. -You can run tests easily by using the [tox](https://pypi.python.org/pypi/tox) command. It is probably already present in your operating system, you would need version 1.6. If it is not, you can download it from pypi with: `pip install tox`. +## Contributors -Example usage: +**WikiTeam** is the [Archive Team](http://www.archiveteam.org) [[GitHub](https://github.com/ArchiveTeam)] subcommittee on wikis. +It was founded and originally developed by [Emilio J. Rodríguez-Posada](https://github.com/emijrp), a Wikipedia veteran editor and amateur archivist. Thanks to people who have helped, especially to: [Federico Leva](https://github.com/nemobis), [Alex Buie](https://github.com/ab2525), [Scott Boyd](http://www.sdboyd56.com), [Hydriz](https://github.com/Hydriz), Platonides, Ian McEwen, [Mike Dupont](https://github.com/h4ck3rm1k3), [balr0g](https://github.com/balr0g) and [PiRSquared17](https://github.com/PiRSquared17). - $ tox - py27 runtests: commands[0] | nosetests --nocapture --nologcapture - Checking http://wiki.annotation.jp/api.php - Trying to parse かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg from API - Retrieving image filenames - . Found 266 images - . 
- ------------------------------------------- - Ran 1 test in 2.253s +**Mediawiki-Scraper** The Python 3 initiative is currently being led by [Elsie Hupp](https://github.com/elsiehupp), with contributions from [Victor Gambier](https://github.com/vgambier), [Thomas Karcher](https://github.com/t-karcher), [Janet Cobb](https://github.com/randomnetcat), [yzqzss](https://github.com/yzqzss), [NyaMisty](https://github.com/NyaMisty) and [Rob Kam](https://github.com/robkam) - OK - _________________ summary _________________ - py27: commands succeeded - congratulations :) - $ +**WikiTeam3** Every archivist who has uploaded a wikidump to the [Internet Archive](https://archive.org/search?query=subject%3Awikiteam3). diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index a68eb484..00000000 --- a/docs/Makefile +++ /dev/null @@ -1,225 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = _build - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
- -.PHONY: help -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " applehelp to make an Apple Help Book" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " epub3 to make an epub3" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - @echo " coverage to run coverage check of the documentation (if enabled)" - @echo " dummy to check syntax errors of document sources" - -.PHONY: clean -clean: - rm -rf $(BUILDDIR)/* - -.PHONY: html -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -.PHONY: dirhtml -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
- -.PHONY: singlehtml -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -.PHONY: pickle -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -.PHONY: json -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -.PHONY: htmlhelp -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -.PHONY: qthelp -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/WikiTeam.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/WikiTeam.qhc" - -.PHONY: applehelp -applehelp: - $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp - @echo - @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." - @echo "N.B. You won't be able to view it unless you put it in" \ - "~/Library/Documentation/Help or install it in your application" \ - "bundle." - -.PHONY: devhelp -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/WikiTeam" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/WikiTeam" - @echo "# devhelp" - -.PHONY: epub -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
- -.PHONY: epub3 -epub3: - $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 - @echo - @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." - -.PHONY: latex -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -.PHONY: latexpdf -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -.PHONY: latexpdfja -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -.PHONY: text -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -.PHONY: man -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -.PHONY: texinfo -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -.PHONY: info -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -.PHONY: gettext -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. 
The message catalogs are in $(BUILDDIR)/locale." - -.PHONY: changes -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -.PHONY: linkcheck -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -.PHONY: doctest -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -.PHONY: coverage -coverage: - $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage - @echo "Testing of coverage in the sources finished, look at the " \ - "results in $(BUILDDIR)/coverage/python.txt." - -.PHONY: xml -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -.PHONY: pseudoxml -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." - -.PHONY: dummy -dummy: - $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy - @echo - @echo "Build finished. Dummy builder generates no files." 
diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 174716c5..00000000 --- a/docs/README.md +++ /dev/null @@ -1,3 +0,0 @@ -Read WikiTeam docs in http://wikiteam.readthedocs.io - -Tutorial for Sphinx http://www.sphinx-doc.org/en/stable/tutorial.html diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle deleted file mode 100644 index 1828e9cb..00000000 Binary files a/docs/_build/doctrees/environment.pickle and /dev/null differ diff --git a/docs/_build/doctrees/index.doctree b/docs/_build/doctrees/index.doctree deleted file mode 100644 index fd850676..00000000 Binary files a/docs/_build/doctrees/index.doctree and /dev/null differ diff --git a/docs/_build/html/.buildinfo b/docs/_build/html/.buildinfo deleted file mode 100644 index 593b3b52..00000000 --- a/docs/_build/html/.buildinfo +++ /dev/null @@ -1,4 +0,0 @@ -# Sphinx build info version 1 -# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 4f459140f1d7ccdaeeb53cd50ec0ad6d -tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/_build/html/_sources/index.txt b/docs/_build/html/_sources/index.txt deleted file mode 100644 index 8a661bd2..00000000 --- a/docs/_build/html/_sources/index.txt +++ /dev/null @@ -1,23 +0,0 @@ -.. WikiTeam documentation master file, created by - sphinx-quickstart on Sat Jul 30 13:44:21 2016. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to WikiTeam's documentation! -==================================== - -Contents: - -.. toctree:: - :maxdepth: 2 - -.. 
automodule:: dumpgenerator - :members: - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - diff --git a/docs/_build/html/_static/ajax-loader.gif b/docs/_build/html/_static/ajax-loader.gif deleted file mode 100644 index 61faf8ca..00000000 Binary files a/docs/_build/html/_static/ajax-loader.gif and /dev/null differ diff --git a/docs/_build/html/_static/alabaster.css b/docs/_build/html/_static/alabaster.css deleted file mode 100644 index a88ce299..00000000 --- a/docs/_build/html/_static/alabaster.css +++ /dev/null @@ -1,693 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -@import url("basic.css"); - -/* -- page layout ----------------------------------------------------------- */ - -body { - font-family: 'goudy old style', 'minion pro', 'bell mt', Georgia, 'Hiragino Mincho Pro', serif; - font-size: 17px; - background-color: #fff; - color: #000; - margin: 0; - padding: 0; -} - - -div.document { - width: 940px; - margin: 30px auto 0 auto; -} - -div.documentwrapper { - float: left; - width: 100%; -} - -div.bodywrapper { - margin: 0 0 0 220px; -} - -div.sphinxsidebar { - width: 220px; - font-size: 14px; - line-height: 1.5; -} - -hr { - border: 1px solid #B1B4B6; -} - -div.body { - background-color: #fff; - color: #3E4349; - padding: 0 30px 0 30px; -} - -div.body > .section { - text-align: left; -} - -div.footer { - width: 940px; - margin: 20px auto 30px auto; - font-size: 14px; - color: #888; - text-align: right; -} - -div.footer a { - color: #888; -} - -p.caption { - font-family: inherit; - font-size: inherit; -} - - -div.relations { - display: none; -} - - -div.sphinxsidebar a { - color: #444; - text-decoration: none; - border-bottom: 1px dotted #999; -} - -div.sphinxsidebar a:hover { - border-bottom: 1px solid #999; -} - -div.sphinxsidebarwrapper { - padding: 18px 10px; -} - -div.sphinxsidebarwrapper p.logo { - padding: 0; - margin: -10px 0 0 0px; - 
text-align: center; -} - -div.sphinxsidebarwrapper h1.logo { - margin-top: -10px; - text-align: center; - margin-bottom: 5px; - text-align: left; -} - -div.sphinxsidebarwrapper h1.logo-name { - margin-top: 0px; -} - -div.sphinxsidebarwrapper p.blurb { - margin-top: 0; - font-style: normal; -} - -div.sphinxsidebar h3, -div.sphinxsidebar h4 { - font-family: 'Garamond', 'Georgia', serif; - color: #444; - font-size: 24px; - font-weight: normal; - margin: 0 0 5px 0; - padding: 0; -} - -div.sphinxsidebar h4 { - font-size: 20px; -} - -div.sphinxsidebar h3 a { - color: #444; -} - -div.sphinxsidebar p.logo a, -div.sphinxsidebar h3 a, -div.sphinxsidebar p.logo a:hover, -div.sphinxsidebar h3 a:hover { - border: none; -} - -div.sphinxsidebar p { - color: #555; - margin: 10px 0; -} - -div.sphinxsidebar ul { - margin: 10px 0; - padding: 0; - color: #000; -} - -div.sphinxsidebar ul li.toctree-l1 > a { - font-size: 120%; -} - -div.sphinxsidebar ul li.toctree-l2 > a { - font-size: 110%; -} - -div.sphinxsidebar input { - border: 1px solid #CCC; - font-family: 'goudy old style', 'minion pro', 'bell mt', Georgia, 'Hiragino Mincho Pro', serif; - font-size: 1em; -} - -div.sphinxsidebar hr { - border: none; - height: 1px; - color: #AAA; - background: #AAA; - - text-align: left; - margin-left: 0; - width: 50%; -} - -/* -- body styles ----------------------------------------------------------- */ - -a { - color: #004B6B; - text-decoration: underline; -} - -a:hover { - color: #6D4100; - text-decoration: underline; -} - -div.body h1, -div.body h2, -div.body h3, -div.body h4, -div.body h5, -div.body h6 { - font-family: 'Garamond', 'Georgia', serif; - font-weight: normal; - margin: 30px 0px 10px 0px; - padding: 0; -} - -div.body h1 { margin-top: 0; padding-top: 0; font-size: 240%; } -div.body h2 { font-size: 180%; } -div.body h3 { font-size: 150%; } -div.body h4 { font-size: 130%; } -div.body h5 { font-size: 100%; } -div.body h6 { font-size: 100%; } - -a.headerlink { - color: #DDD; - padding: 
0 4px; - text-decoration: none; -} - -a.headerlink:hover { - color: #444; - background: #EAEAEA; -} - -div.body p, div.body dd, div.body li { - line-height: 1.4em; -} - -div.admonition { - margin: 20px 0px; - padding: 10px 30px; - background-color: #EEE; - border: 1px solid #CCC; -} - -div.admonition tt.xref, div.admonition code.xref, div.admonition a tt { - background-color: ; - border-bottom: 1px solid #fafafa; -} - -dd div.admonition { - margin-left: -60px; - padding-left: 60px; -} - -div.admonition p.admonition-title { - font-family: 'Garamond', 'Georgia', serif; - font-weight: normal; - font-size: 24px; - margin: 0 0 10px 0; - padding: 0; - line-height: 1; -} - -div.admonition p.last { - margin-bottom: 0; -} - -div.highlight { - background-color: #fff; -} - -dt:target, .highlight { - background: #FAF3E8; -} - -div.warning { - background-color: #FCC; - border: 1px solid #FAA; -} - -div.danger { - background-color: #FCC; - border: 1px solid #FAA; - -moz-box-shadow: 2px 2px 4px #D52C2C; - -webkit-box-shadow: 2px 2px 4px #D52C2C; - box-shadow: 2px 2px 4px #D52C2C; -} - -div.error { - background-color: #FCC; - border: 1px solid #FAA; - -moz-box-shadow: 2px 2px 4px #D52C2C; - -webkit-box-shadow: 2px 2px 4px #D52C2C; - box-shadow: 2px 2px 4px #D52C2C; -} - -div.caution { - background-color: #FCC; - border: 1px solid #FAA; -} - -div.attention { - background-color: #FCC; - border: 1px solid #FAA; -} - -div.important { - background-color: #EEE; - border: 1px solid #CCC; -} - -div.note { - background-color: #EEE; - border: 1px solid #CCC; -} - -div.tip { - background-color: #EEE; - border: 1px solid #CCC; -} - -div.hint { - background-color: #EEE; - border: 1px solid #CCC; -} - -div.seealso { - background-color: #EEE; - border: 1px solid #CCC; -} - -div.topic { - background-color: #EEE; -} - -p.admonition-title { - display: inline; -} - -p.admonition-title:after { - content: ":"; -} - -pre, tt, code { - font-family: 'Consolas', 'Menlo', 'Deja Vu Sans Mono', 'Bitstream 
Vera Sans Mono', monospace; - font-size: 0.9em; -} - -.hll { - background-color: #FFC; - margin: 0 -12px; - padding: 0 12px; - display: block; -} - -img.screenshot { -} - -tt.descname, tt.descclassname, code.descname, code.descclassname { - font-size: 0.95em; -} - -tt.descname, code.descname { - padding-right: 0.08em; -} - -img.screenshot { - -moz-box-shadow: 2px 2px 4px #EEE; - -webkit-box-shadow: 2px 2px 4px #EEE; - box-shadow: 2px 2px 4px #EEE; -} - -table.docutils { - border: 1px solid #888; - -moz-box-shadow: 2px 2px 4px #EEE; - -webkit-box-shadow: 2px 2px 4px #EEE; - box-shadow: 2px 2px 4px #EEE; -} - -table.docutils td, table.docutils th { - border: 1px solid #888; - padding: 0.25em 0.7em; -} - -table.field-list, table.footnote { - border: none; - -moz-box-shadow: none; - -webkit-box-shadow: none; - box-shadow: none; -} - -table.footnote { - margin: 15px 0; - width: 100%; - border: 1px solid #EEE; - background: #FDFDFD; - font-size: 0.9em; -} - -table.footnote + table.footnote { - margin-top: -15px; - border-top: none; -} - -table.field-list th { - padding: 0 0.8em 0 0; -} - -table.field-list td { - padding: 0; -} - -table.field-list p { - margin-bottom: 0.8em; -} - -table.footnote td.label { - width: .1px; - padding: 0.3em 0 0.3em 0.5em; -} - -table.footnote td { - padding: 0.3em 0.5em; -} - -dl { - margin: 0; - padding: 0; -} - -dl dd { - margin-left: 30px; -} - -blockquote { - margin: 0 0 0 30px; - padding: 0; -} - -ul, ol { - /* Matches the 30px from the narrow-screen "li > ul" selector below */ - margin: 10px 0 10px 30px; - padding: 0; -} - -pre { - background: #EEE; - padding: 7px 30px; - margin: 15px 0px; - line-height: 1.3em; -} - -div.viewcode-block:target { - background: #ffd; -} - -dl pre, blockquote pre, li pre { - margin-left: 0; - padding-left: 30px; -} - -dl dl pre { - margin-left: -90px; - padding-left: 90px; -} - -tt, code { - background-color: #ecf0f3; - color: #222; - /* padding: 1px 2px; */ -} - -tt.xref, code.xref, a tt { - 
background-color: #FBFBFB; - border-bottom: 1px solid #fff; -} - -a.reference { - text-decoration: none; - border-bottom: 1px dotted #004B6B; -} - -/* Don't put an underline on images */ -a.image-reference, a.image-reference:hover { - border-bottom: none; -} - -a.reference:hover { - border-bottom: 1px solid #6D4100; -} - -a.footnote-reference { - text-decoration: none; - font-size: 0.7em; - vertical-align: top; - border-bottom: 1px dotted #004B6B; -} - -a.footnote-reference:hover { - border-bottom: 1px solid #6D4100; -} - -a:hover tt, a:hover code { - background: #EEE; -} - - -@media screen and (max-width: 870px) { - - div.sphinxsidebar { - display: none; - } - - div.document { - width: 100%; - - } - - div.documentwrapper { - margin-left: 0; - margin-top: 0; - margin-right: 0; - margin-bottom: 0; - } - - div.bodywrapper { - margin-top: 0; - margin-right: 0; - margin-bottom: 0; - margin-left: 0; - } - - ul { - margin-left: 0; - } - - li > ul { - /* Matches the 30px from the "ul, ol" selector above */ - margin-left: 30px; - } - - .document { - width: auto; - } - - .footer { - width: auto; - } - - .bodywrapper { - margin: 0; - } - - .footer { - width: auto; - } - - .github { - display: none; - } - - - -} - - - -@media screen and (max-width: 875px) { - - body { - margin: 0; - padding: 20px 30px; - } - - div.documentwrapper { - float: none; - background: #fff; - } - - div.sphinxsidebar { - display: block; - float: none; - width: 102.5%; - margin: 50px -30px -20px -30px; - padding: 10px 20px; - background: #333; - color: #FFF; - } - - div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p, - div.sphinxsidebar h3 a { - color: #fff; - } - - div.sphinxsidebar a { - color: #AAA; - } - - div.sphinxsidebar p.logo { - display: none; - } - - div.document { - width: 100%; - margin: 0; - } - - div.footer { - display: none; - } - - div.bodywrapper { - margin: 0; - } - - div.body { - min-height: 0; - padding: 0; - } - - .rtd_doc_footer { - display: none; - } - - .document { 
- width: auto; - } - - .footer { - width: auto; - } - - .footer { - width: auto; - } - - .github { - display: none; - } -} - - -/* misc. */ - -.revsys-inline { - display: none!important; -} - -/* Make nested-list/multi-paragraph items look better in Releases changelog - * pages. Without this, docutils' magical list fuckery causes inconsistent - * formatting between different release sub-lists. - */ -div#changelog > div.section > ul > li > p:only-child { - margin-bottom: 0; -} - -/* Hide fugly table cell borders in ..bibliography:: directive output */ -table.docutils.citation, table.docutils.citation td, table.docutils.citation th { - border: none; - /* Below needed in some edge cases; if not applied, bottom shadows appear */ - -moz-box-shadow: none; - -webkit-box-shadow: none; - box-shadow: none; -} \ No newline at end of file diff --git a/docs/_build/html/_static/basic.css b/docs/_build/html/_static/basic.css deleted file mode 100644 index 2b513f0c..00000000 --- a/docs/_build/html/_static/basic.css +++ /dev/null @@ -1,604 +0,0 @@ -/* - * basic.css - * ~~~~~~~~~ - * - * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * - */ - -/* -- main layout ----------------------------------------------------------- */ - -div.clearer { - clear: both; -} - -/* -- relbar ---------------------------------------------------------------- */ - -div.related { - width: 100%; - font-size: 90%; -} - -div.related h3 { - display: none; -} - -div.related ul { - margin: 0; - padding: 0 0 0 10px; - list-style: none; -} - -div.related li { - display: inline; -} - -div.related li.right { - float: right; - margin-right: 5px; -} - -/* -- sidebar --------------------------------------------------------------- */ - -div.sphinxsidebarwrapper { - padding: 10px 5px 0 10px; -} - -div.sphinxsidebar { - float: left; - width: 230px; - margin-left: -100%; - font-size: 90%; - word-wrap: break-word; - overflow-wrap : break-word; -} - -div.sphinxsidebar ul { - list-style: none; -} - -div.sphinxsidebar ul ul, -div.sphinxsidebar ul.want-points { - margin-left: 20px; - list-style: square; -} - -div.sphinxsidebar ul ul { - margin-top: 0; - margin-bottom: 0; -} - -div.sphinxsidebar form { - margin-top: 10px; -} - -div.sphinxsidebar input { - border: 1px solid #98dbcc; - font-family: sans-serif; - font-size: 1em; -} - -div.sphinxsidebar #searchbox input[type="text"] { - width: 170px; -} - -img { - border: 0; - max-width: 100%; -} - -/* -- search page ----------------------------------------------------------- */ - -ul.search { - margin: 10px 0 0 20px; - padding: 0; -} - -ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; -} - -ul.search li a { - font-weight: bold; -} - -ul.search li div.context { - color: #888; - margin: 2px 0 0 30px; - text-align: left; -} - -ul.keywordmatches li.goodmatch a { - font-weight: bold; -} - -/* -- index page ------------------------------------------------------------ */ - -table.contentstable { - width: 90%; -} - -table.contentstable p.biglink { - line-height: 150%; -} - -a.biglink { - font-size: 1.3em; -} - 
-span.linkdescr { - font-style: italic; - padding-top: 5px; - font-size: 90%; -} - -/* -- general index --------------------------------------------------------- */ - -table.indextable { - width: 100%; -} - -table.indextable td { - text-align: left; - vertical-align: top; -} - -table.indextable dl, table.indextable dd { - margin-top: 0; - margin-bottom: 0; -} - -table.indextable tr.pcap { - height: 10px; -} - -table.indextable tr.cap { - margin-top: 10px; - background-color: #f2f2f2; -} - -img.toggler { - margin-right: 3px; - margin-top: 3px; - cursor: pointer; -} - -div.modindex-jumpbox { - border-top: 1px solid #ddd; - border-bottom: 1px solid #ddd; - margin: 1em 0 1em 0; - padding: 0.4em; -} - -div.genindex-jumpbox { - border-top: 1px solid #ddd; - border-bottom: 1px solid #ddd; - margin: 1em 0 1em 0; - padding: 0.4em; -} - -/* -- general body styles --------------------------------------------------- */ - -div.body p, div.body dd, div.body li, div.body blockquote { - -moz-hyphens: auto; - -ms-hyphens: auto; - -webkit-hyphens: auto; - hyphens: auto; -} - -a.headerlink { - visibility: hidden; -} - -h1:hover > a.headerlink, -h2:hover > a.headerlink, -h3:hover > a.headerlink, -h4:hover > a.headerlink, -h5:hover > a.headerlink, -h6:hover > a.headerlink, -dt:hover > a.headerlink, -caption:hover > a.headerlink, -p.caption:hover > a.headerlink, -div.code-block-caption:hover > a.headerlink { - visibility: visible; -} - -div.body p.caption { - text-align: inherit; -} - -div.body td { - text-align: left; -} - -.field-list ul { - padding-left: 1em; -} - -.first { - margin-top: 0 !important; -} - -p.rubric { - margin-top: 30px; - font-weight: bold; -} - -img.align-left, .figure.align-left, object.align-left { - clear: left; - float: left; - margin-right: 1em; -} - -img.align-right, .figure.align-right, object.align-right { - clear: right; - float: right; - margin-left: 1em; -} - -img.align-center, .figure.align-center, object.align-center { - display: block; - margin-left: 
auto; - margin-right: auto; -} - -.align-left { - text-align: left; -} - -.align-center { - text-align: center; -} - -.align-right { - text-align: right; -} - -/* -- sidebars -------------------------------------------------------------- */ - -div.sidebar { - margin: 0 0 0.5em 1em; - border: 1px solid #ddb; - padding: 7px 7px 0 7px; - background-color: #ffe; - width: 40%; - float: right; -} - -p.sidebar-title { - font-weight: bold; -} - -/* -- topics ---------------------------------------------------------------- */ - -div.topic { - border: 1px solid #ccc; - padding: 7px 7px 0 7px; - margin: 10px 0 10px 0; -} - -p.topic-title { - font-size: 1.1em; - font-weight: bold; - margin-top: 10px; -} - -/* -- admonitions ----------------------------------------------------------- */ - -div.admonition { - margin-top: 10px; - margin-bottom: 10px; - padding: 7px; -} - -div.admonition dt { - font-weight: bold; -} - -div.admonition dl { - margin-bottom: 0; -} - -p.admonition-title { - margin: 0px 10px 5px 0px; - font-weight: bold; -} - -div.body p.centered { - text-align: center; - margin-top: 25px; -} - -/* -- tables ---------------------------------------------------------------- */ - -table.docutils { - border: 0; - border-collapse: collapse; -} - -table caption span.caption-number { - font-style: italic; -} - -table caption span.caption-text { -} - -table.docutils td, table.docutils th { - padding: 1px 8px 1px 5px; - border-top: 0; - border-left: 0; - border-right: 0; - border-bottom: 1px solid #aaa; -} - -table.field-list td, table.field-list th { - border: 0 !important; -} - -table.footnote td, table.footnote th { - border: 0 !important; -} - -th { - text-align: left; - padding-right: 5px; -} - -table.citation { - border-left: solid 1px gray; - margin-left: 1px; -} - -table.citation td { - border-bottom: none; -} - -/* -- figures --------------------------------------------------------------- */ - -div.figure { - margin: 0.5em; - padding: 0.5em; -} - -div.figure p.caption 
{ - padding: 0.3em; -} - -div.figure p.caption span.caption-number { - font-style: italic; -} - -div.figure p.caption span.caption-text { -} - - -/* -- other body styles ----------------------------------------------------- */ - -ol.arabic { - list-style: decimal; -} - -ol.loweralpha { - list-style: lower-alpha; -} - -ol.upperalpha { - list-style: upper-alpha; -} - -ol.lowerroman { - list-style: lower-roman; -} - -ol.upperroman { - list-style: upper-roman; -} - -dl { - margin-bottom: 15px; -} - -dd p { - margin-top: 0px; -} - -dd ul, dd table { - margin-bottom: 10px; -} - -dd { - margin-top: 3px; - margin-bottom: 10px; - margin-left: 30px; -} - -dt:target, .highlighted { - background-color: #fbe54e; -} - -dl.glossary dt { - font-weight: bold; - font-size: 1.1em; -} - -.field-list ul { - margin: 0; - padding-left: 1em; -} - -.field-list p { - margin: 0; -} - -.optional { - font-size: 1.3em; -} - -.sig-paren { - font-size: larger; -} - -.versionmodified { - font-style: italic; -} - -.system-message { - background-color: #fda; - padding: 5px; - border: 3px solid red; -} - -.footnote:target { - background-color: #ffa; -} - -.line-block { - display: block; - margin-top: 1em; - margin-bottom: 1em; -} - -.line-block .line-block { - margin-top: 0; - margin-bottom: 0; - margin-left: 1.5em; -} - -.guilabel, .menuselection { - font-family: sans-serif; -} - -.accelerator { - text-decoration: underline; -} - -.classifier { - font-style: oblique; -} - -abbr, acronym { - border-bottom: dotted 1px; - cursor: help; -} - -/* -- code displays --------------------------------------------------------- */ - -pre { - overflow: auto; - overflow-y: hidden; /* fixes display issues on Chrome browsers */ -} - -td.linenos pre { - padding: 5px 0px; - border: 0; - background-color: transparent; - color: #aaa; -} - -table.highlighttable { - margin-left: 0.5em; -} - -table.highlighttable td { - padding: 0 0.5em 0 0.5em; -} - -div.code-block-caption { - padding: 2px 5px; - font-size: small; -} - 
-div.code-block-caption code { - background-color: transparent; -} - -div.code-block-caption + div > div.highlight > pre { - margin-top: 0; -} - -div.code-block-caption span.caption-number { - padding: 0.1em 0.3em; - font-style: italic; -} - -div.code-block-caption span.caption-text { -} - -div.literal-block-wrapper { - padding: 1em 1em 0; -} - -div.literal-block-wrapper div.highlight { - margin: 0; -} - -code.descname { - background-color: transparent; - font-weight: bold; - font-size: 1.2em; -} - -code.descclassname { - background-color: transparent; -} - -code.xref, a code { - background-color: transparent; - font-weight: bold; -} - -h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { - background-color: transparent; -} - -.viewcode-link { - float: right; -} - -.viewcode-back { - float: right; - font-family: sans-serif; -} - -div.viewcode-block:target { - margin: -1px -10px; - padding: 0 10px; -} - -/* -- math display ---------------------------------------------------------- */ - -img.math { - vertical-align: middle; -} - -div.body div.math p { - text-align: center; -} - -span.eqno { - float: right; -} - -/* -- printout stylesheet --------------------------------------------------- */ - -@media print { - div.document, - div.documentwrapper, - div.bodywrapper { - margin: 0 !important; - width: 100%; - } - - div.sphinxsidebar, - div.related, - div.footer, - #top-link { - display: none; - } -} \ No newline at end of file diff --git a/docs/_build/html/_static/classic.css b/docs/_build/html/_static/classic.css deleted file mode 100644 index d98894b3..00000000 --- a/docs/_build/html/_static/classic.css +++ /dev/null @@ -1,261 +0,0 @@ -/* - * default.css_t - * ~~~~~~~~~~~~~ - * - * Sphinx stylesheet -- default theme. - * - * :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * - */ - -@import url("basic.css"); - -/* -- page layout ----------------------------------------------------------- */ - -body { - font-family: sans-serif; - font-size: 100%; - background-color: #11303d; - color: #000; - margin: 0; - padding: 0; -} - -div.document { - background-color: #1c4e63; -} - -div.documentwrapper { - float: left; - width: 100%; -} - -div.bodywrapper { - margin: 0 0 0 230px; -} - -div.body { - background-color: #ffffff; - color: #000000; - padding: 0 20px 30px 20px; -} - -div.footer { - color: #ffffff; - width: 100%; - padding: 9px 0 9px 0; - text-align: center; - font-size: 75%; -} - -div.footer a { - color: #ffffff; - text-decoration: underline; -} - -div.related { - background-color: #133f52; - line-height: 30px; - color: #ffffff; -} - -div.related a { - color: #ffffff; -} - -div.sphinxsidebar { -} - -div.sphinxsidebar h3 { - font-family: 'Trebuchet MS', sans-serif; - color: #ffffff; - font-size: 1.4em; - font-weight: normal; - margin: 0; - padding: 0; -} - -div.sphinxsidebar h3 a { - color: #ffffff; -} - -div.sphinxsidebar h4 { - font-family: 'Trebuchet MS', sans-serif; - color: #ffffff; - font-size: 1.3em; - font-weight: normal; - margin: 5px 0 0 0; - padding: 0; -} - -div.sphinxsidebar p { - color: #ffffff; -} - -div.sphinxsidebar p.topless { - margin: 5px 10px 10px 10px; -} - -div.sphinxsidebar ul { - margin: 10px; - padding: 0; - color: #ffffff; -} - -div.sphinxsidebar a { - color: #98dbcc; -} - -div.sphinxsidebar input { - border: 1px solid #98dbcc; - font-family: sans-serif; - font-size: 1em; -} - - - -/* -- hyperlink styles ------------------------------------------------------ */ - -a { - color: #355f7c; - text-decoration: none; -} - -a:visited { - color: #355f7c; - text-decoration: none; -} - -a:hover { - text-decoration: underline; -} - - - -/* -- body styles ----------------------------------------------------------- */ - -div.body h1, -div.body h2, -div.body h3, -div.body h4, -div.body h5, -div.body h6 { - font-family: 
'Trebuchet MS', sans-serif; - background-color: #f2f2f2; - font-weight: normal; - color: #20435c; - border-bottom: 1px solid #ccc; - margin: 20px -20px 10px -20px; - padding: 3px 0 3px 10px; -} - -div.body h1 { margin-top: 0; font-size: 200%; } -div.body h2 { font-size: 160%; } -div.body h3 { font-size: 140%; } -div.body h4 { font-size: 120%; } -div.body h5 { font-size: 110%; } -div.body h6 { font-size: 100%; } - -a.headerlink { - color: #c60f0f; - font-size: 0.8em; - padding: 0 4px 0 4px; - text-decoration: none; -} - -a.headerlink:hover { - background-color: #c60f0f; - color: white; -} - -div.body p, div.body dd, div.body li, div.body blockquote { - text-align: justify; - line-height: 130%; -} - -div.admonition p.admonition-title + p { - display: inline; -} - -div.admonition p { - margin-bottom: 5px; -} - -div.admonition pre { - margin-bottom: 5px; -} - -div.admonition ul, div.admonition ol { - margin-bottom: 5px; -} - -div.note { - background-color: #eee; - border: 1px solid #ccc; -} - -div.seealso { - background-color: #ffc; - border: 1px solid #ff6; -} - -div.topic { - background-color: #eee; -} - -div.warning { - background-color: #ffe4e4; - border: 1px solid #f66; -} - -p.admonition-title { - display: inline; -} - -p.admonition-title:after { - content: ":"; -} - -pre { - padding: 5px; - background-color: #eeffcc; - color: #333333; - line-height: 120%; - border: 1px solid #ac9; - border-left: none; - border-right: none; -} - -code { - background-color: #ecf0f3; - padding: 0 1px 0 1px; - font-size: 0.95em; -} - -th { - background-color: #ede; -} - -.warning code { - background: #efc2c2; -} - -.note code { - background: #d6d6d6; -} - -.viewcode-back { - font-family: sans-serif; -} - -div.viewcode-block:target { - background-color: #f4debf; - border-top: 1px solid #ac9; - border-bottom: 1px solid #ac9; -} - -div.code-block-caption { - color: #efefef; - background-color: #1c4e63; -} \ No newline at end of file diff --git 
a/docs/_build/html/_static/comment-bright.png b/docs/_build/html/_static/comment-bright.png deleted file mode 100644 index 551517b8..00000000 Binary files a/docs/_build/html/_static/comment-bright.png and /dev/null differ diff --git a/docs/_build/html/_static/comment-close.png b/docs/_build/html/_static/comment-close.png deleted file mode 100644 index 09b54be4..00000000 Binary files a/docs/_build/html/_static/comment-close.png and /dev/null differ diff --git a/docs/_build/html/_static/comment.png b/docs/_build/html/_static/comment.png deleted file mode 100644 index 92feb52b..00000000 Binary files a/docs/_build/html/_static/comment.png and /dev/null differ diff --git a/docs/_build/html/_static/custom.css b/docs/_build/html/_static/custom.css deleted file mode 100644 index 2a924f1d..00000000 --- a/docs/_build/html/_static/custom.css +++ /dev/null @@ -1 +0,0 @@ -/* This file intentionally left blank. */ diff --git a/docs/_build/html/_static/doctools.js b/docs/_build/html/_static/doctools.js deleted file mode 100644 index 81634956..00000000 --- a/docs/_build/html/_static/doctools.js +++ /dev/null @@ -1,287 +0,0 @@ -/* - * doctools.js - * ~~~~~~~~~~~ - * - * Sphinx JavaScript utilities for all documentation. - * - * :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * - */ - -/** - * select a different prefix for underscore - */ -$u = _.noConflict(); - -/** - * make the code below compatible with browsers without - * an installed firebug like debugger -if (!window.console || !console.firebug) { - var names = ["log", "debug", "info", "warn", "error", "assert", "dir", - "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", - "profile", "profileEnd"]; - window.console = {}; - for (var i = 0; i < names.length; ++i) - window.console[names[i]] = function() {}; -} - */ - -/** - * small helper function to urldecode strings - */ -jQuery.urldecode = function(x) { - return decodeURIComponent(x).replace(/\+/g, ' '); -}; - -/** - * small helper function to urlencode strings - */ -jQuery.urlencode = encodeURIComponent; - -/** - * This function returns the parsed url parameters of the - * current request. Multiple values per key are supported, - * it will always return arrays of strings for the value parts. - */ -jQuery.getQueryParameters = function(s) { - if (typeof s == 'undefined') - s = document.location.search; - var parts = s.substr(s.indexOf('?') + 1).split('&'); - var result = {}; - for (var i = 0; i < parts.length; i++) { - var tmp = parts[i].split('=', 2); - var key = jQuery.urldecode(tmp[0]); - var value = jQuery.urldecode(tmp[1]); - if (key in result) - result[key].push(value); - else - result[key] = [value]; - } - return result; -}; - -/** - * highlight a given string on a jquery object by wrapping it in - * span elements with the given class name. 
- */ -jQuery.fn.highlightText = function(text, className) { - function highlight(node) { - if (node.nodeType == 3) { - var val = node.nodeValue; - var pos = val.toLowerCase().indexOf(text); - if (pos >= 0 && !jQuery(node.parentNode).hasClass(className)) { - var span = document.createElement("span"); - span.className = className; - span.appendChild(document.createTextNode(val.substr(pos, text.length))); - node.parentNode.insertBefore(span, node.parentNode.insertBefore( - document.createTextNode(val.substr(pos + text.length)), - node.nextSibling)); - node.nodeValue = val.substr(0, pos); - } - } - else if (!jQuery(node).is("button, select, textarea")) { - jQuery.each(node.childNodes, function() { - highlight(this); - }); - } - } - return this.each(function() { - highlight(this); - }); -}; - -/* - * backward compatibility for jQuery.browser - * This will be supported until firefox bug is fixed. - */ -if (!jQuery.browser) { - jQuery.uaMatch = function(ua) { - ua = ua.toLowerCase(); - - var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || - /(webkit)[ \/]([\w.]+)/.exec(ua) || - /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || - /(msie) ([\w.]+)/.exec(ua) || - ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || - []; - - return { - browser: match[ 1 ] || "", - version: match[ 2 ] || "0" - }; - }; - jQuery.browser = {}; - jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; -} - -/** - * Small JavaScript module for the documentation. - */ -var Documentation = { - - init : function() { - this.fixFirefoxAnchorBug(); - this.highlightSearchWords(); - this.initIndexTable(); - - }, - - /** - * i18n support - */ - TRANSLATIONS : {}, - PLURAL_EXPR : function(n) { return n == 1 ? 
0 : 1; }, - LOCALE : 'unknown', - - // gettext and ngettext don't access this so that the functions - // can safely bound to a different name (_ = Documentation.gettext) - gettext : function(string) { - var translated = Documentation.TRANSLATIONS[string]; - if (typeof translated == 'undefined') - return string; - return (typeof translated == 'string') ? translated : translated[0]; - }, - - ngettext : function(singular, plural, n) { - var translated = Documentation.TRANSLATIONS[singular]; - if (typeof translated == 'undefined') - return (n == 1) ? singular : plural; - return translated[Documentation.PLURALEXPR(n)]; - }, - - addTranslations : function(catalog) { - for (var key in catalog.messages) - this.TRANSLATIONS[key] = catalog.messages[key]; - this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); - this.LOCALE = catalog.locale; - }, - - /** - * add context elements like header anchor links - */ - addContextElements : function() { - $('div[id] > :header:first').each(function() { - $('\u00B6'). - attr('href', '#' + this.id). - attr('title', _('Permalink to this headline')). - appendTo(this); - }); - $('dt[id]').each(function() { - $('\u00B6'). - attr('href', '#' + this.id). - attr('title', _('Permalink to this definition')). - appendTo(this); - }); - }, - - /** - * workaround a firefox stupidity - * see: https://bugzilla.mozilla.org/show_bug.cgi?id=645075 - */ - fixFirefoxAnchorBug : function() { - if (document.location.hash) - window.setTimeout(function() { - document.location.href += ''; - }, 10); - }, - - /** - * highlight the search words provided in the url in the text - */ - highlightSearchWords : function() { - var params = $.getQueryParameters(); - var terms = (params.highlight) ? 
params.highlight[0].split(/\s+/) : []; - if (terms.length) { - var body = $('div.body'); - if (!body.length) { - body = $('body'); - } - window.setTimeout(function() { - $.each(terms, function() { - body.highlightText(this.toLowerCase(), 'highlighted'); - }); - }, 10); - $('') - .appendTo($('#searchbox')); - } - }, - - /** - * init the domain index toggle buttons - */ - initIndexTable : function() { - var togglers = $('img.toggler').click(function() { - var src = $(this).attr('src'); - var idnum = $(this).attr('id').substr(7); - $('tr.cg-' + idnum).toggle(); - if (src.substr(-9) == 'minus.png') - $(this).attr('src', src.substr(0, src.length-9) + 'plus.png'); - else - $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); - }).css('display', ''); - if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { - togglers.click(); - } - }, - - /** - * helper function to hide the search marks again - */ - hideSearchWords : function() { - $('#searchbox .highlight-link').fadeOut(300); - $('span.highlighted').removeClass('highlighted'); - }, - - /** - * make the url absolute - */ - makeURL : function(relativeURL) { - return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; - }, - - /** - * get the current relative url - */ - getCurrentURL : function() { - var path = document.location.pathname; - var parts = path.split(/\//); - $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { - if (this == '..') - parts.pop(); - }); - var url = parts.join('/'); - return path.substring(url.lastIndexOf('/') + 1, path.length - 1); - }, - - initOnKeyListeners: function() { - $(document).keyup(function(event) { - var activeElementType = document.activeElement.tagName; - // don't navigate when in search box or textarea - if (activeElementType !== 'TEXTAREA' && activeElementType !== 'INPUT' && activeElementType !== 'SELECT') { - switch (event.keyCode) { - case 37: // left - var prevHref = $('link[rel="prev"]').prop('href'); - if (prevHref) { - window.location.href = prevHref; - return 
false; - } - case 39: // right - var nextHref = $('link[rel="next"]').prop('href'); - if (nextHref) { - window.location.href = nextHref; - return false; - } - } - } - }); - } -}; - -// quick alias for translations -_ = Documentation.gettext; - -$(document).ready(function() { - Documentation.init(); -}); \ No newline at end of file diff --git a/docs/_build/html/_static/down-pressed.png b/docs/_build/html/_static/down-pressed.png deleted file mode 100644 index 7c30d004..00000000 Binary files a/docs/_build/html/_static/down-pressed.png and /dev/null differ diff --git a/docs/_build/html/_static/down.png b/docs/_build/html/_static/down.png deleted file mode 100644 index f48098a4..00000000 Binary files a/docs/_build/html/_static/down.png and /dev/null differ diff --git a/docs/_build/html/_static/file.png b/docs/_build/html/_static/file.png deleted file mode 100644 index 254c60bf..00000000 Binary files a/docs/_build/html/_static/file.png and /dev/null differ diff --git a/docs/_build/html/_static/jquery-1.11.1.js b/docs/_build/html/_static/jquery-1.11.1.js deleted file mode 100644 index d4b67f7e..00000000 --- a/docs/_build/html/_static/jquery-1.11.1.js +++ /dev/null @@ -1,10308 +0,0 @@ -/*! - * jQuery JavaScript Library v1.11.1 - * http://jquery.com/ - * - * Includes Sizzle.js - * http://sizzlejs.com/ - * - * Copyright 2005, 2014 jQuery Foundation, Inc. and other contributors - * Released under the MIT license - * http://jquery.org/license - * - * Date: 2014-05-01T17:42Z - */ - -(function( global, factory ) { - - if ( typeof module === "object" && typeof module.exports === "object" ) { - // For CommonJS and CommonJS-like environments where a proper window is present, - // execute the factory and get jQuery - // For environments that do not inherently posses a window with a document - // (such as Node.js), expose a jQuery-making factory as module.exports - // This accentuates the need for the creation of a real window - // e.g. 
var jQuery = require("jquery")(window); - // See ticket #14549 for more info - module.exports = global.document ? - factory( global, true ) : - function( w ) { - if ( !w.document ) { - throw new Error( "jQuery requires a window with a document" ); - } - return factory( w ); - }; - } else { - factory( global ); - } - -// Pass this if window is not defined yet -}(typeof window !== "undefined" ? window : this, function( window, noGlobal ) { - -// Can't do this because several apps including ASP.NET trace -// the stack via arguments.caller.callee and Firefox dies if -// you try to trace through "use strict" call chains. (#13335) -// Support: Firefox 18+ -// - -var deletedIds = []; - -var slice = deletedIds.slice; - -var concat = deletedIds.concat; - -var push = deletedIds.push; - -var indexOf = deletedIds.indexOf; - -var class2type = {}; - -var toString = class2type.toString; - -var hasOwn = class2type.hasOwnProperty; - -var support = {}; - - - -var - version = "1.11.1", - - // Define a local copy of jQuery - jQuery = function( selector, context ) { - // The jQuery object is actually just the init constructor 'enhanced' - // Need init if jQuery is called (just allow error to be thrown if not included) - return new jQuery.fn.init( selector, context ); - }, - - // Support: Android<4.1, IE<9 - // Make sure we trim BOM and NBSP - rtrim = /^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g, - - // Matches dashed string for camelizing - rmsPrefix = /^-ms-/, - rdashAlpha = /-([\da-z])/gi, - - // Used by jQuery.camelCase as callback to replace() - fcamelCase = function( all, letter ) { - return letter.toUpperCase(); - }; - -jQuery.fn = jQuery.prototype = { - // The current version of jQuery being used - jquery: version, - - constructor: jQuery, - - // Start with an empty selector - selector: "", - - // The default length of a jQuery object is 0 - length: 0, - - toArray: function() { - return slice.call( this ); - }, - - // Get the Nth element in the matched element set OR - // Get the whole 
matched element set as a clean array - get: function( num ) { - return num != null ? - - // Return just the one element from the set - ( num < 0 ? this[ num + this.length ] : this[ num ] ) : - - // Return all the elements in a clean array - slice.call( this ); - }, - - // Take an array of elements and push it onto the stack - // (returning the new matched element set) - pushStack: function( elems ) { - - // Build a new jQuery matched element set - var ret = jQuery.merge( this.constructor(), elems ); - - // Add the old object onto the stack (as a reference) - ret.prevObject = this; - ret.context = this.context; - - // Return the newly-formed element set - return ret; - }, - - // Execute a callback for every element in the matched set. - // (You can seed the arguments with an array of args, but this is - // only used internally.) - each: function( callback, args ) { - return jQuery.each( this, callback, args ); - }, - - map: function( callback ) { - return this.pushStack( jQuery.map(this, function( elem, i ) { - return callback.call( elem, i, elem ); - })); - }, - - slice: function() { - return this.pushStack( slice.apply( this, arguments ) ); - }, - - first: function() { - return this.eq( 0 ); - }, - - last: function() { - return this.eq( -1 ); - }, - - eq: function( i ) { - var len = this.length, - j = +i + ( i < 0 ? len : 0 ); - return this.pushStack( j >= 0 && j < len ? [ this[j] ] : [] ); - }, - - end: function() { - return this.prevObject || this.constructor(null); - }, - - // For internal use only. - // Behaves like an Array's method, not like a jQuery method. 
- push: push, - sort: deletedIds.sort, - splice: deletedIds.splice -}; - -jQuery.extend = jQuery.fn.extend = function() { - var src, copyIsArray, copy, name, options, clone, - target = arguments[0] || {}, - i = 1, - length = arguments.length, - deep = false; - - // Handle a deep copy situation - if ( typeof target === "boolean" ) { - deep = target; - - // skip the boolean and the target - target = arguments[ i ] || {}; - i++; - } - - // Handle case when target is a string or something (possible in deep copy) - if ( typeof target !== "object" && !jQuery.isFunction(target) ) { - target = {}; - } - - // extend jQuery itself if only one argument is passed - if ( i === length ) { - target = this; - i--; - } - - for ( ; i < length; i++ ) { - // Only deal with non-null/undefined values - if ( (options = arguments[ i ]) != null ) { - // Extend the base object - for ( name in options ) { - src = target[ name ]; - copy = options[ name ]; - - // Prevent never-ending loop - if ( target === copy ) { - continue; - } - - // Recurse if we're merging plain objects or arrays - if ( deep && copy && ( jQuery.isPlainObject(copy) || (copyIsArray = jQuery.isArray(copy)) ) ) { - if ( copyIsArray ) { - copyIsArray = false; - clone = src && jQuery.isArray(src) ? src : []; - - } else { - clone = src && jQuery.isPlainObject(src) ? src : {}; - } - - // Never move original objects, clone them - target[ name ] = jQuery.extend( deep, clone, copy ); - - // Don't bring in undefined values - } else if ( copy !== undefined ) { - target[ name ] = copy; - } - } - } - } - - // Return the modified object - return target; -}; - -jQuery.extend({ - // Unique for each copy of jQuery on the page - expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), - - // Assume jQuery is ready without the ready module - isReady: true, - - error: function( msg ) { - throw new Error( msg ); - }, - - noop: function() {}, - - // See test/unit/core.js for details concerning isFunction. 
- // Since version 1.3, DOM methods and functions like alert - // aren't supported. They return false on IE (#2968). - isFunction: function( obj ) { - return jQuery.type(obj) === "function"; - }, - - isArray: Array.isArray || function( obj ) { - return jQuery.type(obj) === "array"; - }, - - isWindow: function( obj ) { - /* jshint eqeqeq: false */ - return obj != null && obj == obj.window; - }, - - isNumeric: function( obj ) { - // parseFloat NaNs numeric-cast false positives (null|true|false|"") - // ...but misinterprets leading-number strings, particularly hex literals ("0x...") - // subtraction forces infinities to NaN - return !jQuery.isArray( obj ) && obj - parseFloat( obj ) >= 0; - }, - - isEmptyObject: function( obj ) { - var name; - for ( name in obj ) { - return false; - } - return true; - }, - - isPlainObject: function( obj ) { - var key; - - // Must be an Object. - // Because of IE, we also have to check the presence of the constructor property. - // Make sure that DOM nodes and window objects don't pass through, as well - if ( !obj || jQuery.type(obj) !== "object" || obj.nodeType || jQuery.isWindow( obj ) ) { - return false; - } - - try { - // Not own constructor property must be Object - if ( obj.constructor && - !hasOwn.call(obj, "constructor") && - !hasOwn.call(obj.constructor.prototype, "isPrototypeOf") ) { - return false; - } - } catch ( e ) { - // IE8,9 Will throw exceptions on certain host objects #9897 - return false; - } - - // Support: IE<9 - // Handle iteration over inherited properties before own properties. - if ( support.ownLast ) { - for ( key in obj ) { - return hasOwn.call( obj, key ); - } - } - - // Own properties are enumerated firstly, so to speed up, - // if last one is own, then all properties are own. - for ( key in obj ) {} - - return key === undefined || hasOwn.call( obj, key ); - }, - - type: function( obj ) { - if ( obj == null ) { - return obj + ""; - } - return typeof obj === "object" || typeof obj === "function" ? 
- class2type[ toString.call(obj) ] || "object" : - typeof obj; - }, - - // Evaluates a script in a global context - // Workarounds based on findings by Jim Driscoll - // http://weblogs.java.net/blog/driscoll/archive/2009/09/08/eval-javascript-global-context - globalEval: function( data ) { - if ( data && jQuery.trim( data ) ) { - // We use execScript on Internet Explorer - // We use an anonymous function so that context is window - // rather than jQuery in Firefox - ( window.execScript || function( data ) { - window[ "eval" ].call( window, data ); - } )( data ); - } - }, - - // Convert dashed to camelCase; used by the css and data modules - // Microsoft forgot to hump their vendor prefix (#9572) - camelCase: function( string ) { - return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); - }, - - nodeName: function( elem, name ) { - return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); - }, - - // args is for internal usage only - each: function( obj, callback, args ) { - var value, - i = 0, - length = obj.length, - isArray = isArraylike( obj ); - - if ( args ) { - if ( isArray ) { - for ( ; i < length; i++ ) { - value = callback.apply( obj[ i ], args ); - - if ( value === false ) { - break; - } - } - } else { - for ( i in obj ) { - value = callback.apply( obj[ i ], args ); - - if ( value === false ) { - break; - } - } - } - - // A special, fast, case for the most common use of each - } else { - if ( isArray ) { - for ( ; i < length; i++ ) { - value = callback.call( obj[ i ], i, obj[ i ] ); - - if ( value === false ) { - break; - } - } - } else { - for ( i in obj ) { - value = callback.call( obj[ i ], i, obj[ i ] ); - - if ( value === false ) { - break; - } - } - } - } - - return obj; - }, - - // Support: Android<4.1, IE<9 - trim: function( text ) { - return text == null ? 
- "" : - ( text + "" ).replace( rtrim, "" ); - }, - - // results is for internal usage only - makeArray: function( arr, results ) { - var ret = results || []; - - if ( arr != null ) { - if ( isArraylike( Object(arr) ) ) { - jQuery.merge( ret, - typeof arr === "string" ? - [ arr ] : arr - ); - } else { - push.call( ret, arr ); - } - } - - return ret; - }, - - inArray: function( elem, arr, i ) { - var len; - - if ( arr ) { - if ( indexOf ) { - return indexOf.call( arr, elem, i ); - } - - len = arr.length; - i = i ? i < 0 ? Math.max( 0, len + i ) : i : 0; - - for ( ; i < len; i++ ) { - // Skip accessing in sparse arrays - if ( i in arr && arr[ i ] === elem ) { - return i; - } - } - } - - return -1; - }, - - merge: function( first, second ) { - var len = +second.length, - j = 0, - i = first.length; - - while ( j < len ) { - first[ i++ ] = second[ j++ ]; - } - - // Support: IE<9 - // Workaround casting of .length to NaN on otherwise arraylike objects (e.g., NodeLists) - if ( len !== len ) { - while ( second[j] !== undefined ) { - first[ i++ ] = second[ j++ ]; - } - } - - first.length = i; - - return first; - }, - - grep: function( elems, callback, invert ) { - var callbackInverse, - matches = [], - i = 0, - length = elems.length, - callbackExpect = !invert; - - // Go through the array, only saving the items - // that pass the validator function - for ( ; i < length; i++ ) { - callbackInverse = !callback( elems[ i ], i ); - if ( callbackInverse !== callbackExpect ) { - matches.push( elems[ i ] ); - } - } - - return matches; - }, - - // arg is for internal usage only - map: function( elems, callback, arg ) { - var value, - i = 0, - length = elems.length, - isArray = isArraylike( elems ), - ret = []; - - // Go through the array, translating each of the items to their new values - if ( isArray ) { - for ( ; i < length; i++ ) { - value = callback( elems[ i ], i, arg ); - - if ( value != null ) { - ret.push( value ); - } - } - - // Go through every key on the object, - } else 
{ - for ( i in elems ) { - value = callback( elems[ i ], i, arg ); - - if ( value != null ) { - ret.push( value ); - } - } - } - - // Flatten any nested arrays - return concat.apply( [], ret ); - }, - - // A global GUID counter for objects - guid: 1, - - // Bind a function to a context, optionally partially applying any - // arguments. - proxy: function( fn, context ) { - var args, proxy, tmp; - - if ( typeof context === "string" ) { - tmp = fn[ context ]; - context = fn; - fn = tmp; - } - - // Quick check to determine if target is callable, in the spec - // this throws a TypeError, but we will just return undefined. - if ( !jQuery.isFunction( fn ) ) { - return undefined; - } - - // Simulated bind - args = slice.call( arguments, 2 ); - proxy = function() { - return fn.apply( context || this, args.concat( slice.call( arguments ) ) ); - }; - - // Set the guid of unique handler to the same of original handler, so it can be removed - proxy.guid = fn.guid = fn.guid || jQuery.guid++; - - return proxy; - }, - - now: function() { - return +( new Date() ); - }, - - // jQuery.support is not used in Core but other projects attach their - // properties to it so it needs to exist. - support: support -}); - -// Populate the class2type map -jQuery.each("Boolean Number String Function Array Date RegExp Object Error".split(" "), function(i, name) { - class2type[ "[object " + name + "]" ] = name.toLowerCase(); -}); - -function isArraylike( obj ) { - var length = obj.length, - type = jQuery.type( obj ); - - if ( type === "function" || jQuery.isWindow( obj ) ) { - return false; - } - - if ( obj.nodeType === 1 && length ) { - return true; - } - - return type === "array" || length === 0 || - typeof length === "number" && length > 0 && ( length - 1 ) in obj; -} -var Sizzle = -/*! - * Sizzle CSS Selector Engine v1.10.19 - * http://sizzlejs.com/ - * - * Copyright 2013 jQuery Foundation, Inc. 
and other contributors - * Released under the MIT license - * http://jquery.org/license - * - * Date: 2014-04-18 - */ -(function( window ) { - -var i, - support, - Expr, - getText, - isXML, - tokenize, - compile, - select, - outermostContext, - sortInput, - hasDuplicate, - - // Local document vars - setDocument, - document, - docElem, - documentIsHTML, - rbuggyQSA, - rbuggyMatches, - matches, - contains, - - // Instance-specific data - expando = "sizzle" + -(new Date()), - preferredDoc = window.document, - dirruns = 0, - done = 0, - classCache = createCache(), - tokenCache = createCache(), - compilerCache = createCache(), - sortOrder = function( a, b ) { - if ( a === b ) { - hasDuplicate = true; - } - return 0; - }, - - // General-purpose constants - strundefined = typeof undefined, - MAX_NEGATIVE = 1 << 31, - - // Instance methods - hasOwn = ({}).hasOwnProperty, - arr = [], - pop = arr.pop, - push_native = arr.push, - push = arr.push, - slice = arr.slice, - // Use a stripped-down indexOf if we can't use a native one - indexOf = arr.indexOf || function( elem ) { - var i = 0, - len = this.length; - for ( ; i < len; i++ ) { - if ( this[i] === elem ) { - return i; - } - } - return -1; - }, - - booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped", - - // Regular expressions - - // Whitespace characters http://www.w3.org/TR/css3-selectors/#whitespace - whitespace = "[\\x20\\t\\r\\n\\f]", - // http://www.w3.org/TR/css3-syntax/#characters - characterEncoding = "(?:\\\\.|[\\w-]|[^\\x00-\\xa0])+", - - // Loosely modeled on CSS identifier characters - // An unquoted value should be a CSS identifier http://www.w3.org/TR/css3-selectors/#attribute-selectors - // Proper syntax: http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier - identifier = characterEncoding.replace( "w", "w#" ), - - // Attribute selectors: http://www.w3.org/TR/selectors/#attribute-selectors - attributes = "\\[" + 
whitespace + "*(" + characterEncoding + ")(?:" + whitespace + - // Operator (capture 2) - "*([*^$|!~]?=)" + whitespace + - // "Attribute values must be CSS identifiers [capture 5] or strings [capture 3 or capture 4]" - "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" + whitespace + - "*\\]", - - pseudos = ":(" + characterEncoding + ")(?:\\((" + - // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments: - // 1. quoted (capture 3; capture 4 or capture 5) - "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + - // 2. simple (capture 6) - "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + - // 3. anything else (capture 2) - ".*" + - ")\\)|)", - - // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter - rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + whitespace + "+$", "g" ), - - rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), - rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + "*" ), - - rattributeQuotes = new RegExp( "=" + whitespace + "*([^\\]'\"]*?)" + whitespace + "*\\]", "g" ), - - rpseudo = new RegExp( pseudos ), - ridentifier = new RegExp( "^" + identifier + "$" ), - - matchExpr = { - "ID": new RegExp( "^#(" + characterEncoding + ")" ), - "CLASS": new RegExp( "^\\.(" + characterEncoding + ")" ), - "TAG": new RegExp( "^(" + characterEncoding.replace( "w", "w*" ) + ")" ), - "ATTR": new RegExp( "^" + attributes ), - "PSEUDO": new RegExp( "^" + pseudos ), - "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + whitespace + - "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + whitespace + - "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), - "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), - // For use in libraries implementing .is() - // We use this for POS matching in `select` - "needsContext": new RegExp( "^" + whitespace + 
"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + - whitespace + "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) - }, - - rinputs = /^(?:input|select|textarea|button)$/i, - rheader = /^h\d$/i, - - rnative = /^[^{]+\{\s*\[native \w/, - - // Easily-parseable/retrievable ID or TAG or CLASS selectors - rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, - - rsibling = /[+~]/, - rescape = /'|\\/g, - - // CSS escapes http://www.w3.org/TR/CSS21/syndata.html#escaped-characters - runescape = new RegExp( "\\\\([\\da-f]{1,6}" + whitespace + "?|(" + whitespace + ")|.)", "ig" ), - funescape = function( _, escaped, escapedWhitespace ) { - var high = "0x" + escaped - 0x10000; - // NaN means non-codepoint - // Support: Firefox<24 - // Workaround erroneous numeric interpretation of +"0x" - return high !== high || escapedWhitespace ? - escaped : - high < 0 ? - // BMP codepoint - String.fromCharCode( high + 0x10000 ) : - // Supplemental Plane codepoint (surrogate pair) - String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); - }; - -// Optimize for push.apply( _, NodeList ) -try { - push.apply( - (arr = slice.call( preferredDoc.childNodes )), - preferredDoc.childNodes - ); - // Support: Android<4.0 - // Detect silently failing push.apply - arr[ preferredDoc.childNodes.length ].nodeType; -} catch ( e ) { - push = { apply: arr.length ? - - // Leverage slice if possible - function( target, els ) { - push_native.apply( target, slice.call(els) ); - } : - - // Support: IE<9 - // Otherwise append directly - function( target, els ) { - var j = target.length, - i = 0; - // Can't trust NodeList.length - while ( (target[j++] = els[i++]) ) {} - target.length = j - 1; - } - }; -} - -function Sizzle( selector, context, results, seed ) { - var match, elem, m, nodeType, - // QSA vars - i, groups, old, nid, newContext, newSelector; - - if ( ( context ? 
context.ownerDocument || context : preferredDoc ) !== document ) { - setDocument( context ); - } - - context = context || document; - results = results || []; - - if ( !selector || typeof selector !== "string" ) { - return results; - } - - if ( (nodeType = context.nodeType) !== 1 && nodeType !== 9 ) { - return []; - } - - if ( documentIsHTML && !seed ) { - - // Shortcuts - if ( (match = rquickExpr.exec( selector )) ) { - // Speed-up: Sizzle("#ID") - if ( (m = match[1]) ) { - if ( nodeType === 9 ) { - elem = context.getElementById( m ); - // Check parentNode to catch when Blackberry 4.6 returns - // nodes that are no longer in the document (jQuery #6963) - if ( elem && elem.parentNode ) { - // Handle the case where IE, Opera, and Webkit return items - // by name instead of ID - if ( elem.id === m ) { - results.push( elem ); - return results; - } - } else { - return results; - } - } else { - // Context is not a document - if ( context.ownerDocument && (elem = context.ownerDocument.getElementById( m )) && - contains( context, elem ) && elem.id === m ) { - results.push( elem ); - return results; - } - } - - // Speed-up: Sizzle("TAG") - } else if ( match[2] ) { - push.apply( results, context.getElementsByTagName( selector ) ); - return results; - - // Speed-up: Sizzle(".CLASS") - } else if ( (m = match[3]) && support.getElementsByClassName && context.getElementsByClassName ) { - push.apply( results, context.getElementsByClassName( m ) ); - return results; - } - } - - // QSA path - if ( support.qsa && (!rbuggyQSA || !rbuggyQSA.test( selector )) ) { - nid = old = expando; - newContext = context; - newSelector = nodeType === 9 && selector; - - // qSA works strangely on Element-rooted queries - // We can work around this by specifying an extra ID on the root - // and working up from there (Thanks to Andrew Dupont for the technique) - // IE 8 doesn't work on object elements - if ( nodeType === 1 && context.nodeName.toLowerCase() !== "object" ) { - groups = tokenize( selector 
); - - if ( (old = context.getAttribute("id")) ) { - nid = old.replace( rescape, "\\$&" ); - } else { - context.setAttribute( "id", nid ); - } - nid = "[id='" + nid + "'] "; - - i = groups.length; - while ( i-- ) { - groups[i] = nid + toSelector( groups[i] ); - } - newContext = rsibling.test( selector ) && testContext( context.parentNode ) || context; - newSelector = groups.join(","); - } - - if ( newSelector ) { - try { - push.apply( results, - newContext.querySelectorAll( newSelector ) - ); - return results; - } catch(qsaError) { - } finally { - if ( !old ) { - context.removeAttribute("id"); - } - } - } - } - } - - // All others - return select( selector.replace( rtrim, "$1" ), context, results, seed ); -} - -/** - * Create key-value caches of limited size - * @returns {Function(string, Object)} Returns the Object data after storing it on itself with - * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) - * deleting the oldest entry - */ -function createCache() { - var keys = []; - - function cache( key, value ) { - // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) - if ( keys.push( key + " " ) > Expr.cacheLength ) { - // Only keep the most recent entries - delete cache[ keys.shift() ]; - } - return (cache[ key + " " ] = value); - } - return cache; -} - -/** - * Mark a function for special use by Sizzle - * @param {Function} fn The function to mark - */ -function markFunction( fn ) { - fn[ expando ] = true; - return fn; -} - -/** - * Support testing using an element - * @param {Function} fn Passed the created div and expects a boolean result - */ -function assert( fn ) { - var div = document.createElement("div"); - - try { - return !!fn( div ); - } catch (e) { - return false; - } finally { - // Remove from its parent by default - if ( div.parentNode ) { - div.parentNode.removeChild( div ); - } - // release memory in IE - div = null; - } -} - -/** - * Adds the same handler for all of 
the specified attrs - * @param {String} attrs Pipe-separated list of attributes - * @param {Function} handler The method that will be applied - */ -function addHandle( attrs, handler ) { - var arr = attrs.split("|"), - i = attrs.length; - - while ( i-- ) { - Expr.attrHandle[ arr[i] ] = handler; - } -} - -/** - * Checks document order of two siblings - * @param {Element} a - * @param {Element} b - * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b - */ -function siblingCheck( a, b ) { - var cur = b && a, - diff = cur && a.nodeType === 1 && b.nodeType === 1 && - ( ~b.sourceIndex || MAX_NEGATIVE ) - - ( ~a.sourceIndex || MAX_NEGATIVE ); - - // Use IE sourceIndex if available on both nodes - if ( diff ) { - return diff; - } - - // Check if b follows a - if ( cur ) { - while ( (cur = cur.nextSibling) ) { - if ( cur === b ) { - return -1; - } - } - } - - return a ? 1 : -1; -} - -/** - * Returns a function to use in pseudos for input types - * @param {String} type - */ -function createInputPseudo( type ) { - return function( elem ) { - var name = elem.nodeName.toLowerCase(); - return name === "input" && elem.type === type; - }; -} - -/** - * Returns a function to use in pseudos for buttons - * @param {String} type - */ -function createButtonPseudo( type ) { - return function( elem ) { - var name = elem.nodeName.toLowerCase(); - return (name === "input" || name === "button") && elem.type === type; - }; -} - -/** - * Returns a function to use in pseudos for positionals - * @param {Function} fn - */ -function createPositionalPseudo( fn ) { - return markFunction(function( argument ) { - argument = +argument; - return markFunction(function( seed, matches ) { - var j, - matchIndexes = fn( [], seed.length, argument ), - i = matchIndexes.length; - - // Match elements found at the specified indexes - while ( i-- ) { - if ( seed[ (j = matchIndexes[i]) ] ) { - seed[j] = !(matches[j] = seed[j]); - } - } - }); - }); -} - -/** - * Checks a node for 
validity as a Sizzle context - * @param {Element|Object=} context - * @returns {Element|Object|Boolean} The input node if acceptable, otherwise a falsy value - */ -function testContext( context ) { - return context && typeof context.getElementsByTagName !== strundefined && context; -} - -// Expose support vars for convenience -support = Sizzle.support = {}; - -/** - * Detects XML nodes - * @param {Element|Object} elem An element or a document - * @returns {Boolean} True iff elem is a non-HTML XML node - */ -isXML = Sizzle.isXML = function( elem ) { - // documentElement is verified for cases where it doesn't yet exist - // (such as loading iframes in IE - #4833) - var documentElement = elem && (elem.ownerDocument || elem).documentElement; - return documentElement ? documentElement.nodeName !== "HTML" : false; -}; - -/** - * Sets document-related variables once based on the current document - * @param {Element|Object} [doc] An element or document object to use to set the document - * @returns {Object} Returns the current document - */ -setDocument = Sizzle.setDocument = function( node ) { - var hasCompare, - doc = node ? 
node.ownerDocument || node : preferredDoc, - parent = doc.defaultView; - - // If no document and documentElement is available, return - if ( doc === document || doc.nodeType !== 9 || !doc.documentElement ) { - return document; - } - - // Set our document - document = doc; - docElem = doc.documentElement; - - // Support tests - documentIsHTML = !isXML( doc ); - - // Support: IE>8 - // If iframe document is assigned to "document" variable and if iframe has been reloaded, - // IE will throw "permission denied" error when accessing "document" variable, see jQuery #13936 - // IE6-8 do not support the defaultView property so parent will be undefined - if ( parent && parent !== parent.top ) { - // IE11 does not have attachEvent, so all must suffer - if ( parent.addEventListener ) { - parent.addEventListener( "unload", function() { - setDocument(); - }, false ); - } else if ( parent.attachEvent ) { - parent.attachEvent( "onunload", function() { - setDocument(); - }); - } - } - - /* Attributes - ---------------------------------------------------------------------- */ - - // Support: IE<8 - // Verify that getAttribute really returns attributes and not properties (excepting IE8 booleans) - support.attributes = assert(function( div ) { - div.className = "i"; - return !div.getAttribute("className"); - }); - - /* getElement(s)By* - ---------------------------------------------------------------------- */ - - // Check if getElementsByTagName("*") returns only elements - support.getElementsByTagName = assert(function( div ) { - div.appendChild( doc.createComment("") ); - return !div.getElementsByTagName("*").length; - }); - - // Check if getElementsByClassName can be trusted - support.getElementsByClassName = rnative.test( doc.getElementsByClassName ) && assert(function( div ) { - div.innerHTML = "
"; - - // Support: Safari<4 - // Catch class over-caching - div.firstChild.className = "i"; - // Support: Opera<10 - // Catch gEBCN failure to find non-leading classes - return div.getElementsByClassName("i").length === 2; - }); - - // Support: IE<10 - // Check if getElementById returns elements by name - // The broken getElementById methods don't pick up programatically-set names, - // so use a roundabout getElementsByName test - support.getById = assert(function( div ) { - docElem.appendChild( div ).id = expando; - return !doc.getElementsByName || !doc.getElementsByName( expando ).length; - }); - - // ID find and filter - if ( support.getById ) { - Expr.find["ID"] = function( id, context ) { - if ( typeof context.getElementById !== strundefined && documentIsHTML ) { - var m = context.getElementById( id ); - // Check parentNode to catch when Blackberry 4.6 returns - // nodes that are no longer in the document #6963 - return m && m.parentNode ? [ m ] : []; - } - }; - Expr.filter["ID"] = function( id ) { - var attrId = id.replace( runescape, funescape ); - return function( elem ) { - return elem.getAttribute("id") === attrId; - }; - }; - } else { - // Support: IE6/7 - // getElementById is not reliable as a find shortcut - delete Expr.find["ID"]; - - Expr.filter["ID"] = function( id ) { - var attrId = id.replace( runescape, funescape ); - return function( elem ) { - var node = typeof elem.getAttributeNode !== strundefined && elem.getAttributeNode("id"); - return node && node.value === attrId; - }; - }; - } - - // Tag - Expr.find["TAG"] = support.getElementsByTagName ? 
- function( tag, context ) { - if ( typeof context.getElementsByTagName !== strundefined ) { - return context.getElementsByTagName( tag ); - } - } : - function( tag, context ) { - var elem, - tmp = [], - i = 0, - results = context.getElementsByTagName( tag ); - - // Filter out possible comments - if ( tag === "*" ) { - while ( (elem = results[i++]) ) { - if ( elem.nodeType === 1 ) { - tmp.push( elem ); - } - } - - return tmp; - } - return results; - }; - - // Class - Expr.find["CLASS"] = support.getElementsByClassName && function( className, context ) { - if ( typeof context.getElementsByClassName !== strundefined && documentIsHTML ) { - return context.getElementsByClassName( className ); - } - }; - - /* QSA/matchesSelector - ---------------------------------------------------------------------- */ - - // QSA and matchesSelector support - - // matchesSelector(:active) reports false when true (IE9/Opera 11.5) - rbuggyMatches = []; - - // qSa(:focus) reports false when true (Chrome 21) - // We allow this because of a bug in IE8/9 that throws an error - // whenever `document.activeElement` is accessed on an iframe - // So, we allow :focus to pass through QSA all the time to avoid the IE error - // See http://bugs.jquery.com/ticket/13378 - rbuggyQSA = []; - - if ( (support.qsa = rnative.test( doc.querySelectorAll )) ) { - // Build QSA regex - // Regex strategy adopted from Diego Perini - assert(function( div ) { - // Select is set to empty string on purpose - // This is to test IE's treatment of not explicitly - // setting a boolean content attribute, - // since its presence should be enough - // http://bugs.jquery.com/ticket/12359 - div.innerHTML = ""; - - // Support: IE8, Opera 11-12.16 - // Nothing should be selected when empty strings follow ^= or $= or *= - // The test attribute must be unknown in Opera but "safe" for WinRT - // http://msdn.microsoft.com/en-us/library/ie/hh465388.aspx#attribute_section - if ( div.querySelectorAll("[msallowclip^='']").length ) { - 
rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" ); - } - - // Support: IE8 - // Boolean attributes and "value" are not treated correctly - if ( !div.querySelectorAll("[selected]").length ) { - rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" ); - } - - // Webkit/Opera - :checked should return selected option elements - // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked - // IE8 throws error here and will not see later tests - if ( !div.querySelectorAll(":checked").length ) { - rbuggyQSA.push(":checked"); - } - }); - - assert(function( div ) { - // Support: Windows 8 Native Apps - // The type and name attributes are restricted during .innerHTML assignment - var input = doc.createElement("input"); - input.setAttribute( "type", "hidden" ); - div.appendChild( input ).setAttribute( "name", "D" ); - - // Support: IE8 - // Enforce case-sensitivity of name attribute - if ( div.querySelectorAll("[name=d]").length ) { - rbuggyQSA.push( "name" + whitespace + "*[*^$|!~]?=" ); - } - - // FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled) - // IE8 throws error here and will not see later tests - if ( !div.querySelectorAll(":enabled").length ) { - rbuggyQSA.push( ":enabled", ":disabled" ); - } - - // Opera 10-11 does not throw on post-comma invalid pseudos - div.querySelectorAll("*,:x"); - rbuggyQSA.push(",.*:"); - }); - } - - if ( (support.matchesSelector = rnative.test( (matches = docElem.matches || - docElem.webkitMatchesSelector || - docElem.mozMatchesSelector || - docElem.oMatchesSelector || - docElem.msMatchesSelector) )) ) { - - assert(function( div ) { - // Check to see if it's possible to do matchesSelector - // on a disconnected node (IE 9) - support.disconnectedMatch = matches.call( div, "div" ); - - // This should fail with an exception - // Gecko does not error, returns false instead - matches.call( div, "[s!='']:x" ); - rbuggyMatches.push( "!=", pseudos ); - }); - } - - rbuggyQSA = 
rbuggyQSA.length && new RegExp( rbuggyQSA.join("|") ); - rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join("|") ); - - /* Contains - ---------------------------------------------------------------------- */ - hasCompare = rnative.test( docElem.compareDocumentPosition ); - - // Element contains another - // Purposefully does not implement inclusive descendent - // As in, an element does not contain itself - contains = hasCompare || rnative.test( docElem.contains ) ? - function( a, b ) { - var adown = a.nodeType === 9 ? a.documentElement : a, - bup = b && b.parentNode; - return a === bup || !!( bup && bup.nodeType === 1 && ( - adown.contains ? - adown.contains( bup ) : - a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16 - )); - } : - function( a, b ) { - if ( b ) { - while ( (b = b.parentNode) ) { - if ( b === a ) { - return true; - } - } - } - return false; - }; - - /* Sorting - ---------------------------------------------------------------------- */ - - // Document order sorting - sortOrder = hasCompare ? - function( a, b ) { - - // Flag for duplicate removal - if ( a === b ) { - hasDuplicate = true; - return 0; - } - - // Sort on method existence if only one input has compareDocumentPosition - var compare = !a.compareDocumentPosition - !b.compareDocumentPosition; - if ( compare ) { - return compare; - } - - // Calculate position if both inputs belong to the same document - compare = ( a.ownerDocument || a ) === ( b.ownerDocument || b ) ? 
- a.compareDocumentPosition( b ) : - - // Otherwise we know they are disconnected - 1; - - // Disconnected nodes - if ( compare & 1 || - (!support.sortDetached && b.compareDocumentPosition( a ) === compare) ) { - - // Choose the first element that is related to our preferred document - if ( a === doc || a.ownerDocument === preferredDoc && contains(preferredDoc, a) ) { - return -1; - } - if ( b === doc || b.ownerDocument === preferredDoc && contains(preferredDoc, b) ) { - return 1; - } - - // Maintain original order - return sortInput ? - ( indexOf.call( sortInput, a ) - indexOf.call( sortInput, b ) ) : - 0; - } - - return compare & 4 ? -1 : 1; - } : - function( a, b ) { - // Exit early if the nodes are identical - if ( a === b ) { - hasDuplicate = true; - return 0; - } - - var cur, - i = 0, - aup = a.parentNode, - bup = b.parentNode, - ap = [ a ], - bp = [ b ]; - - // Parentless nodes are either documents or disconnected - if ( !aup || !bup ) { - return a === doc ? -1 : - b === doc ? 1 : - aup ? -1 : - bup ? 1 : - sortInput ? - ( indexOf.call( sortInput, a ) - indexOf.call( sortInput, b ) ) : - 0; - - // If the nodes are siblings, we can do a quick check - } else if ( aup === bup ) { - return siblingCheck( a, b ); - } - - // Otherwise we need full lists of their ancestors for comparison - cur = a; - while ( (cur = cur.parentNode) ) { - ap.unshift( cur ); - } - cur = b; - while ( (cur = cur.parentNode) ) { - bp.unshift( cur ); - } - - // Walk down the tree looking for a discrepancy - while ( ap[i] === bp[i] ) { - i++; - } - - return i ? - // Do a sibling check if the nodes have a common ancestor - siblingCheck( ap[i], bp[i] ) : - - // Otherwise nodes in our document sort first - ap[i] === preferredDoc ? -1 : - bp[i] === preferredDoc ? 
1 : - 0; - }; - - return doc; -}; - -Sizzle.matches = function( expr, elements ) { - return Sizzle( expr, null, null, elements ); -}; - -Sizzle.matchesSelector = function( elem, expr ) { - // Set document vars if needed - if ( ( elem.ownerDocument || elem ) !== document ) { - setDocument( elem ); - } - - // Make sure that attribute selectors are quoted - expr = expr.replace( rattributeQuotes, "='$1']" ); - - if ( support.matchesSelector && documentIsHTML && - ( !rbuggyMatches || !rbuggyMatches.test( expr ) ) && - ( !rbuggyQSA || !rbuggyQSA.test( expr ) ) ) { - - try { - var ret = matches.call( elem, expr ); - - // IE 9's matchesSelector returns false on disconnected nodes - if ( ret || support.disconnectedMatch || - // As well, disconnected nodes are said to be in a document - // fragment in IE 9 - elem.document && elem.document.nodeType !== 11 ) { - return ret; - } - } catch(e) {} - } - - return Sizzle( expr, document, null, [ elem ] ).length > 0; -}; - -Sizzle.contains = function( context, elem ) { - // Set document vars if needed - if ( ( context.ownerDocument || context ) !== document ) { - setDocument( context ); - } - return contains( context, elem ); -}; - -Sizzle.attr = function( elem, name ) { - // Set document vars if needed - if ( ( elem.ownerDocument || elem ) !== document ) { - setDocument( elem ); - } - - var fn = Expr.attrHandle[ name.toLowerCase() ], - // Don't get fooled by Object.prototype properties (jQuery #13807) - val = fn && hasOwn.call( Expr.attrHandle, name.toLowerCase() ) ? - fn( elem, name, !documentIsHTML ) : - undefined; - - return val !== undefined ? - val : - support.attributes || !documentIsHTML ? - elem.getAttribute( name ) : - (val = elem.getAttributeNode(name)) && val.specified ? 
- val.value : - null; -}; - -Sizzle.error = function( msg ) { - throw new Error( "Syntax error, unrecognized expression: " + msg ); -}; - -/** - * Document sorting and removing duplicates - * @param {ArrayLike} results - */ -Sizzle.uniqueSort = function( results ) { - var elem, - duplicates = [], - j = 0, - i = 0; - - // Unless we *know* we can detect duplicates, assume their presence - hasDuplicate = !support.detectDuplicates; - sortInput = !support.sortStable && results.slice( 0 ); - results.sort( sortOrder ); - - if ( hasDuplicate ) { - while ( (elem = results[i++]) ) { - if ( elem === results[ i ] ) { - j = duplicates.push( i ); - } - } - while ( j-- ) { - results.splice( duplicates[ j ], 1 ); - } - } - - // Clear input after sorting to release objects - // See https://github.com/jquery/sizzle/pull/225 - sortInput = null; - - return results; -}; - -/** - * Utility function for retrieving the text value of an array of DOM nodes - * @param {Array|Element} elem - */ -getText = Sizzle.getText = function( elem ) { - var node, - ret = "", - i = 0, - nodeType = elem.nodeType; - - if ( !nodeType ) { - // If no nodeType, this is expected to be an array - while ( (node = elem[i++]) ) { - // Do not traverse comment nodes - ret += getText( node ); - } - } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { - // Use textContent for elements - // innerText usage removed for consistency of new lines (jQuery #11153) - if ( typeof elem.textContent === "string" ) { - return elem.textContent; - } else { - // Traverse its children - for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { - ret += getText( elem ); - } - } - } else if ( nodeType === 3 || nodeType === 4 ) { - return elem.nodeValue; - } - // Do not include comment or processing instruction nodes - - return ret; -}; - -Expr = Sizzle.selectors = { - - // Can be adjusted by the user - cacheLength: 50, - - createPseudo: markFunction, - - match: matchExpr, - - attrHandle: {}, - - find: {}, - - 
relative: { - ">": { dir: "parentNode", first: true }, - " ": { dir: "parentNode" }, - "+": { dir: "previousSibling", first: true }, - "~": { dir: "previousSibling" } - }, - - preFilter: { - "ATTR": function( match ) { - match[1] = match[1].replace( runescape, funescape ); - - // Move the given value to match[3] whether quoted or unquoted - match[3] = ( match[3] || match[4] || match[5] || "" ).replace( runescape, funescape ); - - if ( match[2] === "~=" ) { - match[3] = " " + match[3] + " "; - } - - return match.slice( 0, 4 ); - }, - - "CHILD": function( match ) { - /* matches from matchExpr["CHILD"] - 1 type (only|nth|...) - 2 what (child|of-type) - 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) - 4 xn-component of xn+y argument ([+-]?\d*n|) - 5 sign of xn-component - 6 x of xn-component - 7 sign of y-component - 8 y of y-component - */ - match[1] = match[1].toLowerCase(); - - if ( match[1].slice( 0, 3 ) === "nth" ) { - // nth-* requires argument - if ( !match[3] ) { - Sizzle.error( match[0] ); - } - - // numeric x and y parameters for Expr.filter.CHILD - // remember that false/true cast respectively to 0/1 - match[4] = +( match[4] ? 
match[5] + (match[6] || 1) : 2 * ( match[3] === "even" || match[3] === "odd" ) ); - match[5] = +( ( match[7] + match[8] ) || match[3] === "odd" ); - - // other types prohibit arguments - } else if ( match[3] ) { - Sizzle.error( match[0] ); - } - - return match; - }, - - "PSEUDO": function( match ) { - var excess, - unquoted = !match[6] && match[2]; - - if ( matchExpr["CHILD"].test( match[0] ) ) { - return null; - } - - // Accept quoted arguments as-is - if ( match[3] ) { - match[2] = match[4] || match[5] || ""; - - // Strip excess characters from unquoted arguments - } else if ( unquoted && rpseudo.test( unquoted ) && - // Get excess from tokenize (recursively) - (excess = tokenize( unquoted, true )) && - // advance to the next closing parenthesis - (excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length) ) { - - // excess is a negative index - match[0] = match[0].slice( 0, excess ); - match[2] = unquoted.slice( 0, excess ); - } - - // Return only captures needed by the pseudo filter method (type and argument) - return match.slice( 0, 3 ); - } - }, - - filter: { - - "TAG": function( nodeNameSelector ) { - var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); - return nodeNameSelector === "*" ? 
- function() { return true; } : - function( elem ) { - return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; - }; - }, - - "CLASS": function( className ) { - var pattern = classCache[ className + " " ]; - - return pattern || - (pattern = new RegExp( "(^|" + whitespace + ")" + className + "(" + whitespace + "|$)" )) && - classCache( className, function( elem ) { - return pattern.test( typeof elem.className === "string" && elem.className || typeof elem.getAttribute !== strundefined && elem.getAttribute("class") || "" ); - }); - }, - - "ATTR": function( name, operator, check ) { - return function( elem ) { - var result = Sizzle.attr( elem, name ); - - if ( result == null ) { - return operator === "!="; - } - if ( !operator ) { - return true; - } - - result += ""; - - return operator === "=" ? result === check : - operator === "!=" ? result !== check : - operator === "^=" ? check && result.indexOf( check ) === 0 : - operator === "*=" ? check && result.indexOf( check ) > -1 : - operator === "$=" ? check && result.slice( -check.length ) === check : - operator === "~=" ? ( " " + result + " " ).indexOf( check ) > -1 : - operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : - false; - }; - }, - - "CHILD": function( type, what, argument, first, last ) { - var simple = type.slice( 0, 3 ) !== "nth", - forward = type.slice( -4 ) !== "last", - ofType = what === "of-type"; - - return first === 1 && last === 0 ? - - // Shortcut for :nth-*(n) - function( elem ) { - return !!elem.parentNode; - } : - - function( elem, context, xml ) { - var cache, outerCache, node, diff, nodeIndex, start, - dir = simple !== forward ? "nextSibling" : "previousSibling", - parent = elem.parentNode, - name = ofType && elem.nodeName.toLowerCase(), - useCache = !xml && !ofType; - - if ( parent ) { - - // :(first|last|only)-(child|of-type) - if ( simple ) { - while ( dir ) { - node = elem; - while ( (node = node[ dir ]) ) { - if ( ofType ? 
node.nodeName.toLowerCase() === name : node.nodeType === 1 ) { - return false; - } - } - // Reverse direction for :only-* (if we haven't yet done so) - start = dir = type === "only" && !start && "nextSibling"; - } - return true; - } - - start = [ forward ? parent.firstChild : parent.lastChild ]; - - // non-xml :nth-child(...) stores cache data on `parent` - if ( forward && useCache ) { - // Seek `elem` from a previously-cached index - outerCache = parent[ expando ] || (parent[ expando ] = {}); - cache = outerCache[ type ] || []; - nodeIndex = cache[0] === dirruns && cache[1]; - diff = cache[0] === dirruns && cache[2]; - node = nodeIndex && parent.childNodes[ nodeIndex ]; - - while ( (node = ++nodeIndex && node && node[ dir ] || - - // Fallback to seeking `elem` from the start - (diff = nodeIndex = 0) || start.pop()) ) { - - // When found, cache indexes on `parent` and break - if ( node.nodeType === 1 && ++diff && node === elem ) { - outerCache[ type ] = [ dirruns, nodeIndex, diff ]; - break; - } - } - - // Use previously-cached element index if available - } else if ( useCache && (cache = (elem[ expando ] || (elem[ expando ] = {}))[ type ]) && cache[0] === dirruns ) { - diff = cache[1]; - - // xml :nth-child(...) or :nth-last-child(...) or :nth(-last)?-of-type(...) - } else { - // Use the same loop as above to seek `elem` from the start - while ( (node = ++nodeIndex && node && node[ dir ] || - (diff = nodeIndex = 0) || start.pop()) ) { - - if ( ( ofType ? 
node.nodeName.toLowerCase() === name : node.nodeType === 1 ) && ++diff ) { - // Cache the index of each encountered element - if ( useCache ) { - (node[ expando ] || (node[ expando ] = {}))[ type ] = [ dirruns, diff ]; - } - - if ( node === elem ) { - break; - } - } - } - } - - // Incorporate the offset, then check against cycle size - diff -= last; - return diff === first || ( diff % first === 0 && diff / first >= 0 ); - } - }; - }, - - "PSEUDO": function( pseudo, argument ) { - // pseudo-class names are case-insensitive - // http://www.w3.org/TR/selectors/#pseudo-classes - // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters - // Remember that setFilters inherits from pseudos - var args, - fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || - Sizzle.error( "unsupported pseudo: " + pseudo ); - - // The user may use createPseudo to indicate that - // arguments are needed to create the filter function - // just as Sizzle does - if ( fn[ expando ] ) { - return fn( argument ); - } - - // But maintain support for old signatures - if ( fn.length > 1 ) { - args = [ pseudo, pseudo, "", argument ]; - return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? - markFunction(function( seed, matches ) { - var idx, - matched = fn( seed, argument ), - i = matched.length; - while ( i-- ) { - idx = indexOf.call( seed, matched[i] ); - seed[ idx ] = !( matches[ idx ] = matched[i] ); - } - }) : - function( elem ) { - return fn( elem, 0, args ); - }; - } - - return fn; - } - }, - - pseudos: { - // Potentially complex pseudos - "not": markFunction(function( selector ) { - // Trim the selector passed to compile - // to avoid treating leading and trailing - // spaces as combinators - var input = [], - results = [], - matcher = compile( selector.replace( rtrim, "$1" ) ); - - return matcher[ expando ] ? 
- markFunction(function( seed, matches, context, xml ) { - var elem, - unmatched = matcher( seed, null, xml, [] ), - i = seed.length; - - // Match elements unmatched by `matcher` - while ( i-- ) { - if ( (elem = unmatched[i]) ) { - seed[i] = !(matches[i] = elem); - } - } - }) : - function( elem, context, xml ) { - input[0] = elem; - matcher( input, null, xml, results ); - return !results.pop(); - }; - }), - - "has": markFunction(function( selector ) { - return function( elem ) { - return Sizzle( selector, elem ).length > 0; - }; - }), - - "contains": markFunction(function( text ) { - return function( elem ) { - return ( elem.textContent || elem.innerText || getText( elem ) ).indexOf( text ) > -1; - }; - }), - - // "Whether an element is represented by a :lang() selector - // is based solely on the element's language value - // being equal to the identifier C, - // or beginning with the identifier C immediately followed by "-". - // The matching of C against the element's language value is performed case-insensitively. - // The identifier C does not have to be a valid language name." - // http://www.w3.org/TR/selectors/#lang-pseudo - "lang": markFunction( function( lang ) { - // lang value must be a valid identifier - if ( !ridentifier.test(lang || "") ) { - Sizzle.error( "unsupported lang: " + lang ); - } - lang = lang.replace( runescape, funescape ).toLowerCase(); - return function( elem ) { - var elemLang; - do { - if ( (elemLang = documentIsHTML ? 
- elem.lang : - elem.getAttribute("xml:lang") || elem.getAttribute("lang")) ) { - - elemLang = elemLang.toLowerCase(); - return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; - } - } while ( (elem = elem.parentNode) && elem.nodeType === 1 ); - return false; - }; - }), - - // Miscellaneous - "target": function( elem ) { - var hash = window.location && window.location.hash; - return hash && hash.slice( 1 ) === elem.id; - }, - - "root": function( elem ) { - return elem === docElem; - }, - - "focus": function( elem ) { - return elem === document.activeElement && (!document.hasFocus || document.hasFocus()) && !!(elem.type || elem.href || ~elem.tabIndex); - }, - - // Boolean properties - "enabled": function( elem ) { - return elem.disabled === false; - }, - - "disabled": function( elem ) { - return elem.disabled === true; - }, - - "checked": function( elem ) { - // In CSS3, :checked should return both checked and selected elements - // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked - var nodeName = elem.nodeName.toLowerCase(); - return (nodeName === "input" && !!elem.checked) || (nodeName === "option" && !!elem.selected); - }, - - "selected": function( elem ) { - // Accessing this property makes selected-by-default - // options in Safari work properly - if ( elem.parentNode ) { - elem.parentNode.selectedIndex; - } - - return elem.selected === true; - }, - - // Contents - "empty": function( elem ) { - // http://www.w3.org/TR/selectors/#empty-pseudo - // :empty is negated by element (1) or content nodes (text: 3; cdata: 4; entity ref: 5), - // but not by others (comment: 8; processing instruction: 7; etc.) 
- // nodeType < 6 works because attributes (2) do not appear as children - for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { - if ( elem.nodeType < 6 ) { - return false; - } - } - return true; - }, - - "parent": function( elem ) { - return !Expr.pseudos["empty"]( elem ); - }, - - // Element/input types - "header": function( elem ) { - return rheader.test( elem.nodeName ); - }, - - "input": function( elem ) { - return rinputs.test( elem.nodeName ); - }, - - "button": function( elem ) { - var name = elem.nodeName.toLowerCase(); - return name === "input" && elem.type === "button" || name === "button"; - }, - - "text": function( elem ) { - var attr; - return elem.nodeName.toLowerCase() === "input" && - elem.type === "text" && - - // Support: IE<8 - // New HTML5 attribute values (e.g., "search") appear with elem.type === "text" - ( (attr = elem.getAttribute("type")) == null || attr.toLowerCase() === "text" ); - }, - - // Position-in-collection - "first": createPositionalPseudo(function() { - return [ 0 ]; - }), - - "last": createPositionalPseudo(function( matchIndexes, length ) { - return [ length - 1 ]; - }), - - "eq": createPositionalPseudo(function( matchIndexes, length, argument ) { - return [ argument < 0 ? argument + length : argument ]; - }), - - "even": createPositionalPseudo(function( matchIndexes, length ) { - var i = 0; - for ( ; i < length; i += 2 ) { - matchIndexes.push( i ); - } - return matchIndexes; - }), - - "odd": createPositionalPseudo(function( matchIndexes, length ) { - var i = 1; - for ( ; i < length; i += 2 ) { - matchIndexes.push( i ); - } - return matchIndexes; - }), - - "lt": createPositionalPseudo(function( matchIndexes, length, argument ) { - var i = argument < 0 ? argument + length : argument; - for ( ; --i >= 0; ) { - matchIndexes.push( i ); - } - return matchIndexes; - }), - - "gt": createPositionalPseudo(function( matchIndexes, length, argument ) { - var i = argument < 0 ? 
argument + length : argument; - for ( ; ++i < length; ) { - matchIndexes.push( i ); - } - return matchIndexes; - }) - } -}; - -Expr.pseudos["nth"] = Expr.pseudos["eq"]; - -// Add button/input type pseudos -for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) { - Expr.pseudos[ i ] = createInputPseudo( i ); -} -for ( i in { submit: true, reset: true } ) { - Expr.pseudos[ i ] = createButtonPseudo( i ); -} - -// Easy API for creating new setFilters -function setFilters() {} -setFilters.prototype = Expr.filters = Expr.pseudos; -Expr.setFilters = new setFilters(); - -tokenize = Sizzle.tokenize = function( selector, parseOnly ) { - var matched, match, tokens, type, - soFar, groups, preFilters, - cached = tokenCache[ selector + " " ]; - - if ( cached ) { - return parseOnly ? 0 : cached.slice( 0 ); - } - - soFar = selector; - groups = []; - preFilters = Expr.preFilter; - - while ( soFar ) { - - // Comma and first run - if ( !matched || (match = rcomma.exec( soFar )) ) { - if ( match ) { - // Don't consume trailing commas as valid - soFar = soFar.slice( match[0].length ) || soFar; - } - groups.push( (tokens = []) ); - } - - matched = false; - - // Combinators - if ( (match = rcombinators.exec( soFar )) ) { - matched = match.shift(); - tokens.push({ - value: matched, - // Cast descendant combinators to space - type: match[0].replace( rtrim, " " ) - }); - soFar = soFar.slice( matched.length ); - } - - // Filters - for ( type in Expr.filter ) { - if ( (match = matchExpr[ type ].exec( soFar )) && (!preFilters[ type ] || - (match = preFilters[ type ]( match ))) ) { - matched = match.shift(); - tokens.push({ - value: matched, - type: type, - matches: match - }); - soFar = soFar.slice( matched.length ); - } - } - - if ( !matched ) { - break; - } - } - - // Return the length of the invalid excess - // if we're just parsing - // Otherwise, throw an error or return tokens - return parseOnly ? - soFar.length : - soFar ? 
- Sizzle.error( selector ) : - // Cache the tokens - tokenCache( selector, groups ).slice( 0 ); -}; - -function toSelector( tokens ) { - var i = 0, - len = tokens.length, - selector = ""; - for ( ; i < len; i++ ) { - selector += tokens[i].value; - } - return selector; -} - -function addCombinator( matcher, combinator, base ) { - var dir = combinator.dir, - checkNonElements = base && dir === "parentNode", - doneName = done++; - - return combinator.first ? - // Check against closest ancestor/preceding element - function( elem, context, xml ) { - while ( (elem = elem[ dir ]) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - return matcher( elem, context, xml ); - } - } - } : - - // Check against all ancestor/preceding elements - function( elem, context, xml ) { - var oldCache, outerCache, - newCache = [ dirruns, doneName ]; - - // We can't set arbitrary data on XML nodes, so they don't benefit from dir caching - if ( xml ) { - while ( (elem = elem[ dir ]) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - if ( matcher( elem, context, xml ) ) { - return true; - } - } - } - } else { - while ( (elem = elem[ dir ]) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - outerCache = elem[ expando ] || (elem[ expando ] = {}); - if ( (oldCache = outerCache[ dir ]) && - oldCache[ 0 ] === dirruns && oldCache[ 1 ] === doneName ) { - - // Assign to newCache so results back-propagate to previous elements - return (newCache[ 2 ] = oldCache[ 2 ]); - } else { - // Reuse newcache so results back-propagate to previous elements - outerCache[ dir ] = newCache; - - // A match means we're done; a fail means we have to keep checking - if ( (newCache[ 2 ] = matcher( elem, context, xml )) ) { - return true; - } - } - } - } - } - }; -} - -function elementMatcher( matchers ) { - return matchers.length > 1 ? 
- function( elem, context, xml ) { - var i = matchers.length; - while ( i-- ) { - if ( !matchers[i]( elem, context, xml ) ) { - return false; - } - } - return true; - } : - matchers[0]; -} - -function multipleContexts( selector, contexts, results ) { - var i = 0, - len = contexts.length; - for ( ; i < len; i++ ) { - Sizzle( selector, contexts[i], results ); - } - return results; -} - -function condense( unmatched, map, filter, context, xml ) { - var elem, - newUnmatched = [], - i = 0, - len = unmatched.length, - mapped = map != null; - - for ( ; i < len; i++ ) { - if ( (elem = unmatched[i]) ) { - if ( !filter || filter( elem, context, xml ) ) { - newUnmatched.push( elem ); - if ( mapped ) { - map.push( i ); - } - } - } - } - - return newUnmatched; -} - -function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) { - if ( postFilter && !postFilter[ expando ] ) { - postFilter = setMatcher( postFilter ); - } - if ( postFinder && !postFinder[ expando ] ) { - postFinder = setMatcher( postFinder, postSelector ); - } - return markFunction(function( seed, results, context, xml ) { - var temp, i, elem, - preMap = [], - postMap = [], - preexisting = results.length, - - // Get initial elements from seed or context - elems = seed || multipleContexts( selector || "*", context.nodeType ? [ context ] : context, [] ), - - // Prefilter to get matcher input, preserving a map for seed-results synchronization - matcherIn = preFilter && ( seed || !selector ) ? - condense( elems, preMap, preFilter, context, xml ) : - elems, - - matcherOut = matcher ? - // If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results, - postFinder || ( seed ? preFilter : preexisting || postFilter ) ? 
- - // ...intermediate processing is necessary - [] : - - // ...otherwise use results directly - results : - matcherIn; - - // Find primary matches - if ( matcher ) { - matcher( matcherIn, matcherOut, context, xml ); - } - - // Apply postFilter - if ( postFilter ) { - temp = condense( matcherOut, postMap ); - postFilter( temp, [], context, xml ); - - // Un-match failing elements by moving them back to matcherIn - i = temp.length; - while ( i-- ) { - if ( (elem = temp[i]) ) { - matcherOut[ postMap[i] ] = !(matcherIn[ postMap[i] ] = elem); - } - } - } - - if ( seed ) { - if ( postFinder || preFilter ) { - if ( postFinder ) { - // Get the final matcherOut by condensing this intermediate into postFinder contexts - temp = []; - i = matcherOut.length; - while ( i-- ) { - if ( (elem = matcherOut[i]) ) { - // Restore matcherIn since elem is not yet a final match - temp.push( (matcherIn[i] = elem) ); - } - } - postFinder( null, (matcherOut = []), temp, xml ); - } - - // Move matched elements from seed to results to keep them synchronized - i = matcherOut.length; - while ( i-- ) { - if ( (elem = matcherOut[i]) && - (temp = postFinder ? indexOf.call( seed, elem ) : preMap[i]) > -1 ) { - - seed[temp] = !(results[temp] = elem); - } - } - } - - // Add elements to results, through postFinder if defined - } else { - matcherOut = condense( - matcherOut === results ? - matcherOut.splice( preexisting, matcherOut.length ) : - matcherOut - ); - if ( postFinder ) { - postFinder( null, results, matcherOut, xml ); - } else { - push.apply( results, matcherOut ); - } - } - }); -} - -function matcherFromTokens( tokens ) { - var checkContext, matcher, j, - len = tokens.length, - leadingRelative = Expr.relative[ tokens[0].type ], - implicitRelative = leadingRelative || Expr.relative[" "], - i = leadingRelative ? 
1 : 0, - - // The foundational matcher ensures that elements are reachable from top-level context(s) - matchContext = addCombinator( function( elem ) { - return elem === checkContext; - }, implicitRelative, true ), - matchAnyContext = addCombinator( function( elem ) { - return indexOf.call( checkContext, elem ) > -1; - }, implicitRelative, true ), - matchers = [ function( elem, context, xml ) { - return ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( - (checkContext = context).nodeType ? - matchContext( elem, context, xml ) : - matchAnyContext( elem, context, xml ) ); - } ]; - - for ( ; i < len; i++ ) { - if ( (matcher = Expr.relative[ tokens[i].type ]) ) { - matchers = [ addCombinator(elementMatcher( matchers ), matcher) ]; - } else { - matcher = Expr.filter[ tokens[i].type ].apply( null, tokens[i].matches ); - - // Return special upon seeing a positional matcher - if ( matcher[ expando ] ) { - // Find the next relative operator (if any) for proper handling - j = ++i; - for ( ; j < len; j++ ) { - if ( Expr.relative[ tokens[j].type ] ) { - break; - } - } - return setMatcher( - i > 1 && elementMatcher( matchers ), - i > 1 && toSelector( - // If the preceding token was a descendant combinator, insert an implicit any-element `*` - tokens.slice( 0, i - 1 ).concat({ value: tokens[ i - 2 ].type === " " ? 
"*" : "" }) - ).replace( rtrim, "$1" ), - matcher, - i < j && matcherFromTokens( tokens.slice( i, j ) ), - j < len && matcherFromTokens( (tokens = tokens.slice( j )) ), - j < len && toSelector( tokens ) - ); - } - matchers.push( matcher ); - } - } - - return elementMatcher( matchers ); -} - -function matcherFromGroupMatchers( elementMatchers, setMatchers ) { - var bySet = setMatchers.length > 0, - byElement = elementMatchers.length > 0, - superMatcher = function( seed, context, xml, results, outermost ) { - var elem, j, matcher, - matchedCount = 0, - i = "0", - unmatched = seed && [], - setMatched = [], - contextBackup = outermostContext, - // We must always have either seed elements or outermost context - elems = seed || byElement && Expr.find["TAG"]( "*", outermost ), - // Use integer dirruns iff this is the outermost matcher - dirrunsUnique = (dirruns += contextBackup == null ? 1 : Math.random() || 0.1), - len = elems.length; - - if ( outermost ) { - outermostContext = context !== document && context; - } - - // Add elements passing elementMatchers directly to results - // Keep `i` a string if there are no elements so `matchedCount` will be "00" below - // Support: IE<9, Safari - // Tolerate NodeList properties (IE: "length"; Safari: ) matching elements by id - for ( ; i !== len && (elem = elems[i]) != null; i++ ) { - if ( byElement && elem ) { - j = 0; - while ( (matcher = elementMatchers[j++]) ) { - if ( matcher( elem, context, xml ) ) { - results.push( elem ); - break; - } - } - if ( outermost ) { - dirruns = dirrunsUnique; - } - } - - // Track unmatched elements for set filters - if ( bySet ) { - // They will have gone through all possible matchers - if ( (elem = !matcher && elem) ) { - matchedCount--; - } - - // Lengthen the array for every element, matched or not - if ( seed ) { - unmatched.push( elem ); - } - } - } - - // Apply set filters to unmatched elements - matchedCount += i; - if ( bySet && i !== matchedCount ) { - j = 0; - while ( (matcher = 
setMatchers[j++]) ) { - matcher( unmatched, setMatched, context, xml ); - } - - if ( seed ) { - // Reintegrate element matches to eliminate the need for sorting - if ( matchedCount > 0 ) { - while ( i-- ) { - if ( !(unmatched[i] || setMatched[i]) ) { - setMatched[i] = pop.call( results ); - } - } - } - - // Discard index placeholder values to get only actual matches - setMatched = condense( setMatched ); - } - - // Add matches to results - push.apply( results, setMatched ); - - // Seedless set matches succeeding multiple successful matchers stipulate sorting - if ( outermost && !seed && setMatched.length > 0 && - ( matchedCount + setMatchers.length ) > 1 ) { - - Sizzle.uniqueSort( results ); - } - } - - // Override manipulation of globals by nested matchers - if ( outermost ) { - dirruns = dirrunsUnique; - outermostContext = contextBackup; - } - - return unmatched; - }; - - return bySet ? - markFunction( superMatcher ) : - superMatcher; -} - -compile = Sizzle.compile = function( selector, match /* Internal Use Only */ ) { - var i, - setMatchers = [], - elementMatchers = [], - cached = compilerCache[ selector + " " ]; - - if ( !cached ) { - // Generate a function of recursive functions that can be used to check each element - if ( !match ) { - match = tokenize( selector ); - } - i = match.length; - while ( i-- ) { - cached = matcherFromTokens( match[i] ); - if ( cached[ expando ] ) { - setMatchers.push( cached ); - } else { - elementMatchers.push( cached ); - } - } - - // Cache the compiled function - cached = compilerCache( selector, matcherFromGroupMatchers( elementMatchers, setMatchers ) ); - - // Save selector and tokenization - cached.selector = selector; - } - return cached; -}; - -/** - * A low-level selection function that works with Sizzle's compiled - * selector functions - * @param {String|Function} selector A selector or a pre-compiled - * selector function built with Sizzle.compile - * @param {Element} context - * @param {Array} [results] - * @param 
{Array} [seed] A set of elements to match against - */ -select = Sizzle.select = function( selector, context, results, seed ) { - var i, tokens, token, type, find, - compiled = typeof selector === "function" && selector, - match = !seed && tokenize( (selector = compiled.selector || selector) ); - - results = results || []; - - // Try to minimize operations if there is no seed and only one group - if ( match.length === 1 ) { - - // Take a shortcut and set the context if the root selector is an ID - tokens = match[0] = match[0].slice( 0 ); - if ( tokens.length > 2 && (token = tokens[0]).type === "ID" && - support.getById && context.nodeType === 9 && documentIsHTML && - Expr.relative[ tokens[1].type ] ) { - - context = ( Expr.find["ID"]( token.matches[0].replace(runescape, funescape), context ) || [] )[0]; - if ( !context ) { - return results; - - // Precompiled matchers will still verify ancestry, so step up a level - } else if ( compiled ) { - context = context.parentNode; - } - - selector = selector.slice( tokens.shift().value.length ); - } - - // Fetch a seed set for right-to-left matching - i = matchExpr["needsContext"].test( selector ) ? 
0 : tokens.length; - while ( i-- ) { - token = tokens[i]; - - // Abort if we hit a combinator - if ( Expr.relative[ (type = token.type) ] ) { - break; - } - if ( (find = Expr.find[ type ]) ) { - // Search, expanding context for leading sibling combinators - if ( (seed = find( - token.matches[0].replace( runescape, funescape ), - rsibling.test( tokens[0].type ) && testContext( context.parentNode ) || context - )) ) { - - // If seed is empty or no tokens remain, we can return early - tokens.splice( i, 1 ); - selector = seed.length && toSelector( tokens ); - if ( !selector ) { - push.apply( results, seed ); - return results; - } - - break; - } - } - } - } - - // Compile and execute a filtering function if one is not provided - // Provide `match` to avoid retokenization if we modified the selector above - ( compiled || compile( selector, match ) )( - seed, - context, - !documentIsHTML, - results, - rsibling.test( selector ) && testContext( context.parentNode ) || context - ); - return results; -}; - -// One-time assignments - -// Sort stability -support.sortStable = expando.split("").sort( sortOrder ).join("") === expando; - -// Support: Chrome<14 -// Always assume duplicates if they aren't passed to the comparison function -support.detectDuplicates = !!hasDuplicate; - -// Initialize against the default document -setDocument(); - -// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27) -// Detached nodes confoundingly follow *each other* -support.sortDetached = assert(function( div1 ) { - // Should return 1, but returns 4 (following) - return div1.compareDocumentPosition( document.createElement("div") ) & 1; -}); - -// Support: IE<8 -// Prevent attribute/property "interpolation" -// http://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx -if ( !assert(function( div ) { - div.innerHTML = ""; - return div.firstChild.getAttribute("href") === "#" ; -}) ) { - addHandle( "type|href|height|width", function( elem, name, isXML ) { - if ( !isXML ) { - 
return elem.getAttribute( name, name.toLowerCase() === "type" ? 1 : 2 ); - } - }); -} - -// Support: IE<9 -// Use defaultValue in place of getAttribute("value") -if ( !support.attributes || !assert(function( div ) { - div.innerHTML = ""; - div.firstChild.setAttribute( "value", "" ); - return div.firstChild.getAttribute( "value" ) === ""; -}) ) { - addHandle( "value", function( elem, name, isXML ) { - if ( !isXML && elem.nodeName.toLowerCase() === "input" ) { - return elem.defaultValue; - } - }); -} - -// Support: IE<9 -// Use getAttributeNode to fetch booleans when getAttribute lies -if ( !assert(function( div ) { - return div.getAttribute("disabled") == null; -}) ) { - addHandle( booleans, function( elem, name, isXML ) { - var val; - if ( !isXML ) { - return elem[ name ] === true ? name.toLowerCase() : - (val = elem.getAttributeNode( name )) && val.specified ? - val.value : - null; - } - }); -} - -return Sizzle; - -})( window ); - - - -jQuery.find = Sizzle; -jQuery.expr = Sizzle.selectors; -jQuery.expr[":"] = jQuery.expr.pseudos; -jQuery.unique = Sizzle.uniqueSort; -jQuery.text = Sizzle.getText; -jQuery.isXMLDoc = Sizzle.isXML; -jQuery.contains = Sizzle.contains; - - - -var rneedsContext = jQuery.expr.match.needsContext; - -var rsingleTag = (/^<(\w+)\s*\/?>(?:<\/\1>|)$/); - - - -var risSimple = /^.[^:#\[\.,]*$/; - -// Implement the identical functionality for filter and not -function winnow( elements, qualifier, not ) { - if ( jQuery.isFunction( qualifier ) ) { - return jQuery.grep( elements, function( elem, i ) { - /* jshint -W018 */ - return !!qualifier.call( elem, i, elem ) !== not; - }); - - } - - if ( qualifier.nodeType ) { - return jQuery.grep( elements, function( elem ) { - return ( elem === qualifier ) !== not; - }); - - } - - if ( typeof qualifier === "string" ) { - if ( risSimple.test( qualifier ) ) { - return jQuery.filter( qualifier, elements, not ); - } - - qualifier = jQuery.filter( qualifier, elements ); - } - - return jQuery.grep( elements, 
function( elem ) { - return ( jQuery.inArray( elem, qualifier ) >= 0 ) !== not; - }); -} - -jQuery.filter = function( expr, elems, not ) { - var elem = elems[ 0 ]; - - if ( not ) { - expr = ":not(" + expr + ")"; - } - - return elems.length === 1 && elem.nodeType === 1 ? - jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : [] : - jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) { - return elem.nodeType === 1; - })); -}; - -jQuery.fn.extend({ - find: function( selector ) { - var i, - ret = [], - self = this, - len = self.length; - - if ( typeof selector !== "string" ) { - return this.pushStack( jQuery( selector ).filter(function() { - for ( i = 0; i < len; i++ ) { - if ( jQuery.contains( self[ i ], this ) ) { - return true; - } - } - }) ); - } - - for ( i = 0; i < len; i++ ) { - jQuery.find( selector, self[ i ], ret ); - } - - // Needed because $( selector, context ) becomes $( context ).find( selector ) - ret = this.pushStack( len > 1 ? jQuery.unique( ret ) : ret ); - ret.selector = this.selector ? this.selector + " " + selector : selector; - return ret; - }, - filter: function( selector ) { - return this.pushStack( winnow(this, selector || [], false) ); - }, - not: function( selector ) { - return this.pushStack( winnow(this, selector || [], true) ); - }, - is: function( selector ) { - return !!winnow( - this, - - // If this is a positional/relative selector, check membership in the returned set - // so $("p:first").is("p:last") won't return true for a doc with two "p". - typeof selector === "string" && rneedsContext.test( selector ) ? 
- jQuery( selector ) : - selector || [], - false - ).length; - } -}); - - -// Initialize a jQuery object - - -// A central reference to the root jQuery(document) -var rootjQuery, - - // Use the correct document accordingly with window argument (sandbox) - document = window.document, - - // A simple way to check for HTML strings - // Prioritize #id over to avoid XSS via location.hash (#9521) - // Strict HTML recognition (#11290: must start with <) - rquickExpr = /^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]*))$/, - - init = jQuery.fn.init = function( selector, context ) { - var match, elem; - - // HANDLE: $(""), $(null), $(undefined), $(false) - if ( !selector ) { - return this; - } - - // Handle HTML strings - if ( typeof selector === "string" ) { - if ( selector.charAt(0) === "<" && selector.charAt( selector.length - 1 ) === ">" && selector.length >= 3 ) { - // Assume that strings that start and end with <> are HTML and skip the regex check - match = [ null, selector, null ]; - - } else { - match = rquickExpr.exec( selector ); - } - - // Match html or make sure no context is specified for #id - if ( match && (match[1] || !context) ) { - - // HANDLE: $(html) -> $(array) - if ( match[1] ) { - context = context instanceof jQuery ? context[0] : context; - - // scripts is true for back-compat - // Intentionally let the error be thrown if parseHTML is not present - jQuery.merge( this, jQuery.parseHTML( - match[1], - context && context.nodeType ? 
context.ownerDocument || context : document, - true - ) ); - - // HANDLE: $(html, props) - if ( rsingleTag.test( match[1] ) && jQuery.isPlainObject( context ) ) { - for ( match in context ) { - // Properties of context are called as methods if possible - if ( jQuery.isFunction( this[ match ] ) ) { - this[ match ]( context[ match ] ); - - // ...and otherwise set as attributes - } else { - this.attr( match, context[ match ] ); - } - } - } - - return this; - - // HANDLE: $(#id) - } else { - elem = document.getElementById( match[2] ); - - // Check parentNode to catch when Blackberry 4.6 returns - // nodes that are no longer in the document #6963 - if ( elem && elem.parentNode ) { - // Handle the case where IE and Opera return items - // by name instead of ID - if ( elem.id !== match[2] ) { - return rootjQuery.find( selector ); - } - - // Otherwise, we inject the element directly into the jQuery object - this.length = 1; - this[0] = elem; - } - - this.context = document; - this.selector = selector; - return this; - } - - // HANDLE: $(expr, $(...)) - } else if ( !context || context.jquery ) { - return ( context || rootjQuery ).find( selector ); - - // HANDLE: $(expr, context) - // (which is just equivalent to: $(context).find(expr) - } else { - return this.constructor( context ).find( selector ); - } - - // HANDLE: $(DOMElement) - } else if ( selector.nodeType ) { - this.context = this[0] = selector; - this.length = 1; - return this; - - // HANDLE: $(function) - // Shortcut for document ready - } else if ( jQuery.isFunction( selector ) ) { - return typeof rootjQuery.ready !== "undefined" ? 
- rootjQuery.ready( selector ) : - // Execute immediately if ready is not present - selector( jQuery ); - } - - if ( selector.selector !== undefined ) { - this.selector = selector.selector; - this.context = selector.context; - } - - return jQuery.makeArray( selector, this ); - }; - -// Give the init function the jQuery prototype for later instantiation -init.prototype = jQuery.fn; - -// Initialize central reference -rootjQuery = jQuery( document ); - - -var rparentsprev = /^(?:parents|prev(?:Until|All))/, - // methods guaranteed to produce a unique set when starting from a unique set - guaranteedUnique = { - children: true, - contents: true, - next: true, - prev: true - }; - -jQuery.extend({ - dir: function( elem, dir, until ) { - var matched = [], - cur = elem[ dir ]; - - while ( cur && cur.nodeType !== 9 && (until === undefined || cur.nodeType !== 1 || !jQuery( cur ).is( until )) ) { - if ( cur.nodeType === 1 ) { - matched.push( cur ); - } - cur = cur[dir]; - } - return matched; - }, - - sibling: function( n, elem ) { - var r = []; - - for ( ; n; n = n.nextSibling ) { - if ( n.nodeType === 1 && n !== elem ) { - r.push( n ); - } - } - - return r; - } -}); - -jQuery.fn.extend({ - has: function( target ) { - var i, - targets = jQuery( target, this ), - len = targets.length; - - return this.filter(function() { - for ( i = 0; i < len; i++ ) { - if ( jQuery.contains( this, targets[i] ) ) { - return true; - } - } - }); - }, - - closest: function( selectors, context ) { - var cur, - i = 0, - l = this.length, - matched = [], - pos = rneedsContext.test( selectors ) || typeof selectors !== "string" ? - jQuery( selectors, context || this.context ) : - 0; - - for ( ; i < l; i++ ) { - for ( cur = this[i]; cur && cur !== context; cur = cur.parentNode ) { - // Always skip document fragments - if ( cur.nodeType < 11 && (pos ? 
- pos.index(cur) > -1 : - - // Don't pass non-elements to Sizzle - cur.nodeType === 1 && - jQuery.find.matchesSelector(cur, selectors)) ) { - - matched.push( cur ); - break; - } - } - } - - return this.pushStack( matched.length > 1 ? jQuery.unique( matched ) : matched ); - }, - - // Determine the position of an element within - // the matched set of elements - index: function( elem ) { - - // No argument, return index in parent - if ( !elem ) { - return ( this[0] && this[0].parentNode ) ? this.first().prevAll().length : -1; - } - - // index in selector - if ( typeof elem === "string" ) { - return jQuery.inArray( this[0], jQuery( elem ) ); - } - - // Locate the position of the desired element - return jQuery.inArray( - // If it receives a jQuery object, the first element is used - elem.jquery ? elem[0] : elem, this ); - }, - - add: function( selector, context ) { - return this.pushStack( - jQuery.unique( - jQuery.merge( this.get(), jQuery( selector, context ) ) - ) - ); - }, - - addBack: function( selector ) { - return this.add( selector == null ? - this.prevObject : this.prevObject.filter(selector) - ); - } -}); - -function sibling( cur, dir ) { - do { - cur = cur[ dir ]; - } while ( cur && cur.nodeType !== 1 ); - - return cur; -} - -jQuery.each({ - parent: function( elem ) { - var parent = elem.parentNode; - return parent && parent.nodeType !== 11 ? 
parent : null; - }, - parents: function( elem ) { - return jQuery.dir( elem, "parentNode" ); - }, - parentsUntil: function( elem, i, until ) { - return jQuery.dir( elem, "parentNode", until ); - }, - next: function( elem ) { - return sibling( elem, "nextSibling" ); - }, - prev: function( elem ) { - return sibling( elem, "previousSibling" ); - }, - nextAll: function( elem ) { - return jQuery.dir( elem, "nextSibling" ); - }, - prevAll: function( elem ) { - return jQuery.dir( elem, "previousSibling" ); - }, - nextUntil: function( elem, i, until ) { - return jQuery.dir( elem, "nextSibling", until ); - }, - prevUntil: function( elem, i, until ) { - return jQuery.dir( elem, "previousSibling", until ); - }, - siblings: function( elem ) { - return jQuery.sibling( ( elem.parentNode || {} ).firstChild, elem ); - }, - children: function( elem ) { - return jQuery.sibling( elem.firstChild ); - }, - contents: function( elem ) { - return jQuery.nodeName( elem, "iframe" ) ? - elem.contentDocument || elem.contentWindow.document : - jQuery.merge( [], elem.childNodes ); - } -}, function( name, fn ) { - jQuery.fn[ name ] = function( until, selector ) { - var ret = jQuery.map( this, fn, until ); - - if ( name.slice( -5 ) !== "Until" ) { - selector = until; - } - - if ( selector && typeof selector === "string" ) { - ret = jQuery.filter( selector, ret ); - } - - if ( this.length > 1 ) { - // Remove duplicates - if ( !guaranteedUnique[ name ] ) { - ret = jQuery.unique( ret ); - } - - // Reverse order for parents* and prev-derivatives - if ( rparentsprev.test( name ) ) { - ret = ret.reverse(); - } - } - - return this.pushStack( ret ); - }; -}); -var rnotwhite = (/\S+/g); - - - -// String to Object options format cache -var optionsCache = {}; - -// Convert String-formatted options into Object-formatted ones and store in cache -function createOptions( options ) { - var object = optionsCache[ options ] = {}; - jQuery.each( options.match( rnotwhite ) || [], function( _, flag ) { - object[ flag 
] = true; - }); - return object; -} - -/* - * Create a callback list using the following parameters: - * - * options: an optional list of space-separated options that will change how - * the callback list behaves or a more traditional option object - * - * By default a callback list will act like an event callback list and can be - * "fired" multiple times. - * - * Possible options: - * - * once: will ensure the callback list can only be fired once (like a Deferred) - * - * memory: will keep track of previous values and will call any callback added - * after the list has been fired right away with the latest "memorized" - * values (like a Deferred) - * - * unique: will ensure a callback can only be added once (no duplicate in the list) - * - * stopOnFalse: interrupt callings when a callback returns false - * - */ -jQuery.Callbacks = function( options ) { - - // Convert options from String-formatted to Object-formatted if needed - // (we check in cache first) - options = typeof options === "string" ? 
- ( optionsCache[ options ] || createOptions( options ) ) : - jQuery.extend( {}, options ); - - var // Flag to know if list is currently firing - firing, - // Last fire value (for non-forgettable lists) - memory, - // Flag to know if list was already fired - fired, - // End of the loop when firing - firingLength, - // Index of currently firing callback (modified by remove if needed) - firingIndex, - // First callback to fire (used internally by add and fireWith) - firingStart, - // Actual callback list - list = [], - // Stack of fire calls for repeatable lists - stack = !options.once && [], - // Fire callbacks - fire = function( data ) { - memory = options.memory && data; - fired = true; - firingIndex = firingStart || 0; - firingStart = 0; - firingLength = list.length; - firing = true; - for ( ; list && firingIndex < firingLength; firingIndex++ ) { - if ( list[ firingIndex ].apply( data[ 0 ], data[ 1 ] ) === false && options.stopOnFalse ) { - memory = false; // To prevent further calls using add - break; - } - } - firing = false; - if ( list ) { - if ( stack ) { - if ( stack.length ) { - fire( stack.shift() ); - } - } else if ( memory ) { - list = []; - } else { - self.disable(); - } - } - }, - // Actual Callbacks object - self = { - // Add a callback or a collection of callbacks to the list - add: function() { - if ( list ) { - // First, we save the current length - var start = list.length; - (function add( args ) { - jQuery.each( args, function( _, arg ) { - var type = jQuery.type( arg ); - if ( type === "function" ) { - if ( !options.unique || !self.has( arg ) ) { - list.push( arg ); - } - } else if ( arg && arg.length && type !== "string" ) { - // Inspect recursively - add( arg ); - } - }); - })( arguments ); - // Do we need to add the callbacks to the - // current firing batch? 
- if ( firing ) { - firingLength = list.length; - // With memory, if we're not firing then - // we should call right away - } else if ( memory ) { - firingStart = start; - fire( memory ); - } - } - return this; - }, - // Remove a callback from the list - remove: function() { - if ( list ) { - jQuery.each( arguments, function( _, arg ) { - var index; - while ( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) { - list.splice( index, 1 ); - // Handle firing indexes - if ( firing ) { - if ( index <= firingLength ) { - firingLength--; - } - if ( index <= firingIndex ) { - firingIndex--; - } - } - } - }); - } - return this; - }, - // Check if a given callback is in the list. - // If no argument is given, return whether or not list has callbacks attached. - has: function( fn ) { - return fn ? jQuery.inArray( fn, list ) > -1 : !!( list && list.length ); - }, - // Remove all callbacks from the list - empty: function() { - list = []; - firingLength = 0; - return this; - }, - // Have the list do nothing anymore - disable: function() { - list = stack = memory = undefined; - return this; - }, - // Is it disabled? - disabled: function() { - return !list; - }, - // Lock the list in its current state - lock: function() { - stack = undefined; - if ( !memory ) { - self.disable(); - } - return this; - }, - // Is it locked? - locked: function() { - return !stack; - }, - // Call all callbacks with the given context and arguments - fireWith: function( context, args ) { - if ( list && ( !fired || stack ) ) { - args = args || []; - args = [ context, args.slice ? 
args.slice() : args ]; - if ( firing ) { - stack.push( args ); - } else { - fire( args ); - } - } - return this; - }, - // Call all the callbacks with the given arguments - fire: function() { - self.fireWith( this, arguments ); - return this; - }, - // To know if the callbacks have already been called at least once - fired: function() { - return !!fired; - } - }; - - return self; -}; - - -jQuery.extend({ - - Deferred: function( func ) { - var tuples = [ - // action, add listener, listener list, final state - [ "resolve", "done", jQuery.Callbacks("once memory"), "resolved" ], - [ "reject", "fail", jQuery.Callbacks("once memory"), "rejected" ], - [ "notify", "progress", jQuery.Callbacks("memory") ] - ], - state = "pending", - promise = { - state: function() { - return state; - }, - always: function() { - deferred.done( arguments ).fail( arguments ); - return this; - }, - then: function( /* fnDone, fnFail, fnProgress */ ) { - var fns = arguments; - return jQuery.Deferred(function( newDefer ) { - jQuery.each( tuples, function( i, tuple ) { - var fn = jQuery.isFunction( fns[ i ] ) && fns[ i ]; - // deferred[ done | fail | progress ] for forwarding actions to newDefer - deferred[ tuple[1] ](function() { - var returned = fn && fn.apply( this, arguments ); - if ( returned && jQuery.isFunction( returned.promise ) ) { - returned.promise() - .done( newDefer.resolve ) - .fail( newDefer.reject ) - .progress( newDefer.notify ); - } else { - newDefer[ tuple[ 0 ] + "With" ]( this === promise ? newDefer.promise() : this, fn ? [ returned ] : arguments ); - } - }); - }); - fns = null; - }).promise(); - }, - // Get a promise for this deferred - // If obj is provided, the promise aspect is added to the object - promise: function( obj ) { - return obj != null ? 
jQuery.extend( obj, promise ) : promise; - } - }, - deferred = {}; - - // Keep pipe for back-compat - promise.pipe = promise.then; - - // Add list-specific methods - jQuery.each( tuples, function( i, tuple ) { - var list = tuple[ 2 ], - stateString = tuple[ 3 ]; - - // promise[ done | fail | progress ] = list.add - promise[ tuple[1] ] = list.add; - - // Handle state - if ( stateString ) { - list.add(function() { - // state = [ resolved | rejected ] - state = stateString; - - // [ reject_list | resolve_list ].disable; progress_list.lock - }, tuples[ i ^ 1 ][ 2 ].disable, tuples[ 2 ][ 2 ].lock ); - } - - // deferred[ resolve | reject | notify ] - deferred[ tuple[0] ] = function() { - deferred[ tuple[0] + "With" ]( this === deferred ? promise : this, arguments ); - return this; - }; - deferred[ tuple[0] + "With" ] = list.fireWith; - }); - - // Make the deferred a promise - promise.promise( deferred ); - - // Call given func if any - if ( func ) { - func.call( deferred, deferred ); - } - - // All done! - return deferred; - }, - - // Deferred helper - when: function( subordinate /* , ..., subordinateN */ ) { - var i = 0, - resolveValues = slice.call( arguments ), - length = resolveValues.length, - - // the count of uncompleted subordinates - remaining = length !== 1 || ( subordinate && jQuery.isFunction( subordinate.promise ) ) ? length : 0, - - // the master Deferred. If resolveValues consist of only a single Deferred, just use that. - deferred = remaining === 1 ? subordinate : jQuery.Deferred(), - - // Update function for both resolve and progress values - updateFunc = function( i, contexts, values ) { - return function( value ) { - contexts[ i ] = this; - values[ i ] = arguments.length > 1 ? 
slice.call( arguments ) : value; - if ( values === progressValues ) { - deferred.notifyWith( contexts, values ); - - } else if ( !(--remaining) ) { - deferred.resolveWith( contexts, values ); - } - }; - }, - - progressValues, progressContexts, resolveContexts; - - // add listeners to Deferred subordinates; treat others as resolved - if ( length > 1 ) { - progressValues = new Array( length ); - progressContexts = new Array( length ); - resolveContexts = new Array( length ); - for ( ; i < length; i++ ) { - if ( resolveValues[ i ] && jQuery.isFunction( resolveValues[ i ].promise ) ) { - resolveValues[ i ].promise() - .done( updateFunc( i, resolveContexts, resolveValues ) ) - .fail( deferred.reject ) - .progress( updateFunc( i, progressContexts, progressValues ) ); - } else { - --remaining; - } - } - } - - // if we're not waiting on anything, resolve the master - if ( !remaining ) { - deferred.resolveWith( resolveContexts, resolveValues ); - } - - return deferred.promise(); - } -}); - - -// The deferred used on DOM ready -var readyList; - -jQuery.fn.ready = function( fn ) { - // Add the callback - jQuery.ready.promise().done( fn ); - - return this; -}; - -jQuery.extend({ - // Is the DOM ready to be used? Set to true once it occurs. - isReady: false, - - // A counter to track how many items to wait for before - // the ready event fires. See #6781 - readyWait: 1, - - // Hold (or release) the ready event - holdReady: function( hold ) { - if ( hold ) { - jQuery.readyWait++; - } else { - jQuery.ready( true ); - } - }, - - // Handle when the DOM is ready - ready: function( wait ) { - - // Abort if there are pending holds or we're already ready - if ( wait === true ? --jQuery.readyWait : jQuery.isReady ) { - return; - } - - // Make sure body exists, at least, in case IE gets a little overzealous (ticket #5443). 
- if ( !document.body ) { - return setTimeout( jQuery.ready ); - } - - // Remember that the DOM is ready - jQuery.isReady = true; - - // If a normal DOM Ready event fired, decrement, and wait if need be - if ( wait !== true && --jQuery.readyWait > 0 ) { - return; - } - - // If there are functions bound, to execute - readyList.resolveWith( document, [ jQuery ] ); - - // Trigger any bound ready events - if ( jQuery.fn.triggerHandler ) { - jQuery( document ).triggerHandler( "ready" ); - jQuery( document ).off( "ready" ); - } - } -}); - -/** - * Clean-up method for dom ready events - */ -function detach() { - if ( document.addEventListener ) { - document.removeEventListener( "DOMContentLoaded", completed, false ); - window.removeEventListener( "load", completed, false ); - - } else { - document.detachEvent( "onreadystatechange", completed ); - window.detachEvent( "onload", completed ); - } -} - -/** - * The ready event handler and self cleanup method - */ -function completed() { - // readyState === "complete" is good enough for us to call the dom ready in oldIE - if ( document.addEventListener || event.type === "load" || document.readyState === "complete" ) { - detach(); - jQuery.ready(); - } -} - -jQuery.ready.promise = function( obj ) { - if ( !readyList ) { - - readyList = jQuery.Deferred(); - - // Catch cases where $(document).ready() is called after the browser event has already occurred. 
- // we once tried to use readyState "interactive" here, but it caused issues like the one - // discovered by ChrisS here: http://bugs.jquery.com/ticket/12282#comment:15 - if ( document.readyState === "complete" ) { - // Handle it asynchronously to allow scripts the opportunity to delay ready - setTimeout( jQuery.ready ); - - // Standards-based browsers support DOMContentLoaded - } else if ( document.addEventListener ) { - // Use the handy event callback - document.addEventListener( "DOMContentLoaded", completed, false ); - - // A fallback to window.onload, that will always work - window.addEventListener( "load", completed, false ); - - // If IE event model is used - } else { - // Ensure firing before onload, maybe late but safe also for iframes - document.attachEvent( "onreadystatechange", completed ); - - // A fallback to window.onload, that will always work - window.attachEvent( "onload", completed ); - - // If IE and not a frame - // continually check to see if the document is ready - var top = false; - - try { - top = window.frameElement == null && document.documentElement; - } catch(e) {} - - if ( top && top.doScroll ) { - (function doScrollCheck() { - if ( !jQuery.isReady ) { - - try { - // Use the trick by Diego Perini - // http://javascript.nwbox.com/IEContentLoaded/ - top.doScroll("left"); - } catch(e) { - return setTimeout( doScrollCheck, 50 ); - } - - // detach all dom ready events - detach(); - - // and execute any waiting functions - jQuery.ready(); - } - })(); - } - } - } - return readyList.promise( obj ); -}; - - -var strundefined = typeof undefined; - - - -// Support: IE<9 -// Iteration over object's inherited properties before its own -var i; -for ( i in jQuery( support ) ) { - break; -} -support.ownLast = i !== "0"; - -// Note: most support tests are defined in their respective modules. 
-// false until the test is run -support.inlineBlockNeedsLayout = false; - -// Execute ASAP in case we need to set body.style.zoom -jQuery(function() { - // Minified: var a,b,c,d - var val, div, body, container; - - body = document.getElementsByTagName( "body" )[ 0 ]; - if ( !body || !body.style ) { - // Return for frameset docs that don't have a body - return; - } - - // Setup - div = document.createElement( "div" ); - container = document.createElement( "div" ); - container.style.cssText = "position:absolute;border:0;width:0;height:0;top:0;left:-9999px"; - body.appendChild( container ).appendChild( div ); - - if ( typeof div.style.zoom !== strundefined ) { - // Support: IE<8 - // Check if natively block-level elements act like inline-block - // elements when setting their display to 'inline' and giving - // them layout - div.style.cssText = "display:inline;margin:0;border:0;padding:1px;width:1px;zoom:1"; - - support.inlineBlockNeedsLayout = val = div.offsetWidth === 3; - if ( val ) { - // Prevent IE 6 from affecting layout for positioned elements #11048 - // Prevent IE from shrinking the body in IE 7 mode #12869 - // Support: IE<8 - body.style.zoom = 1; - } - } - - body.removeChild( container ); -}); - - - - -(function() { - var div = document.createElement( "div" ); - - // Execute the test only if not already executed in another module. - if (support.deleteExpando == null) { - // Support: IE<9 - support.deleteExpando = true; - try { - delete div.test; - } catch( e ) { - support.deleteExpando = false; - } - } - - // Null elements to avoid leaks in IE. - div = null; -})(); - - -/** - * Determines whether an object can have data - */ -jQuery.acceptData = function( elem ) { - var noData = jQuery.noData[ (elem.nodeName + " ").toLowerCase() ], - nodeType = +elem.nodeType || 1; - - // Do not set data on non-element DOM nodes because it will not be cleared (#8335). - return nodeType !== 1 && nodeType !== 9 ? 
- false : - - // Nodes accept data unless otherwise specified; rejection can be conditional - !noData || noData !== true && elem.getAttribute("classid") === noData; -}; - - -var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, - rmultiDash = /([A-Z])/g; - -function dataAttr( elem, key, data ) { - // If nothing was found internally, try to fetch any - // data from the HTML5 data-* attribute - if ( data === undefined && elem.nodeType === 1 ) { - - var name = "data-" + key.replace( rmultiDash, "-$1" ).toLowerCase(); - - data = elem.getAttribute( name ); - - if ( typeof data === "string" ) { - try { - data = data === "true" ? true : - data === "false" ? false : - data === "null" ? null : - // Only convert to a number if it doesn't change the string - +data + "" === data ? +data : - rbrace.test( data ) ? jQuery.parseJSON( data ) : - data; - } catch( e ) {} - - // Make sure we set the data so it isn't changed later - jQuery.data( elem, key, data ); - - } else { - data = undefined; - } - } - - return data; -} - -// checks a cache object for emptiness -function isEmptyDataObject( obj ) { - var name; - for ( name in obj ) { - - // if the public data object is empty, the private is still empty - if ( name === "data" && jQuery.isEmptyObject( obj[name] ) ) { - continue; - } - if ( name !== "toJSON" ) { - return false; - } - } - - return true; -} - -function internalData( elem, name, data, pvt /* Internal Use Only */ ) { - if ( !jQuery.acceptData( elem ) ) { - return; - } - - var ret, thisCache, - internalKey = jQuery.expando, - - // We have to handle DOM nodes and JS objects differently because IE6-7 - // can't GC object references properly across the DOM-JS boundary - isNode = elem.nodeType, - - // Only DOM nodes need the global jQuery cache; JS object data is - // attached directly to the object so GC can occur automatically - cache = isNode ? 
jQuery.cache : elem, - - // Only defining an ID for JS objects if its cache already exists allows - // the code to shortcut on the same path as a DOM node with no cache - id = isNode ? elem[ internalKey ] : elem[ internalKey ] && internalKey; - - // Avoid doing any more work than we need to when trying to get data on an - // object that has no data at all - if ( (!id || !cache[id] || (!pvt && !cache[id].data)) && data === undefined && typeof name === "string" ) { - return; - } - - if ( !id ) { - // Only DOM nodes need a new unique ID for each element since their data - // ends up in the global cache - if ( isNode ) { - id = elem[ internalKey ] = deletedIds.pop() || jQuery.guid++; - } else { - id = internalKey; - } - } - - if ( !cache[ id ] ) { - // Avoid exposing jQuery metadata on plain JS objects when the object - // is serialized using JSON.stringify - cache[ id ] = isNode ? {} : { toJSON: jQuery.noop }; - } - - // An object can be passed to jQuery.data instead of a key/value pair; this gets - // shallow copied over onto the existing cache - if ( typeof name === "object" || typeof name === "function" ) { - if ( pvt ) { - cache[ id ] = jQuery.extend( cache[ id ], name ); - } else { - cache[ id ].data = jQuery.extend( cache[ id ].data, name ); - } - } - - thisCache = cache[ id ]; - - // jQuery data() is stored in a separate object inside the object's internal data - // cache in order to avoid key collisions between internal data and user-defined - // data. 
- if ( !pvt ) { - if ( !thisCache.data ) { - thisCache.data = {}; - } - - thisCache = thisCache.data; - } - - if ( data !== undefined ) { - thisCache[ jQuery.camelCase( name ) ] = data; - } - - // Check for both converted-to-camel and non-converted data property names - // If a data property was specified - if ( typeof name === "string" ) { - - // First Try to find as-is property data - ret = thisCache[ name ]; - - // Test for null|undefined property data - if ( ret == null ) { - - // Try to find the camelCased property - ret = thisCache[ jQuery.camelCase( name ) ]; - } - } else { - ret = thisCache; - } - - return ret; -} - -function internalRemoveData( elem, name, pvt ) { - if ( !jQuery.acceptData( elem ) ) { - return; - } - - var thisCache, i, - isNode = elem.nodeType, - - // See jQuery.data for more information - cache = isNode ? jQuery.cache : elem, - id = isNode ? elem[ jQuery.expando ] : jQuery.expando; - - // If there is already no cache entry for this object, there is no - // purpose in continuing - if ( !cache[ id ] ) { - return; - } - - if ( name ) { - - thisCache = pvt ? cache[ id ] : cache[ id ].data; - - if ( thisCache ) { - - // Support array or space separated string names for data keys - if ( !jQuery.isArray( name ) ) { - - // try the string as a key before any manipulation - if ( name in thisCache ) { - name = [ name ]; - } else { - - // split the camel cased version by spaces unless a key with the spaces exists - name = jQuery.camelCase( name ); - if ( name in thisCache ) { - name = [ name ]; - } else { - name = name.split(" "); - } - } - } else { - // If "name" is an array of keys... - // When data is initially created, via ("key", "val") signature, - // keys will be converted to camelCase. - // Since there is no way to tell _how_ a key was added, remove - // both plain key and camelCase key. #12786 - // This will only penalize the array argument path. 
- name = name.concat( jQuery.map( name, jQuery.camelCase ) ); - } - - i = name.length; - while ( i-- ) { - delete thisCache[ name[i] ]; - } - - // If there is no data left in the cache, we want to continue - // and let the cache object itself get destroyed - if ( pvt ? !isEmptyDataObject(thisCache) : !jQuery.isEmptyObject(thisCache) ) { - return; - } - } - } - - // See jQuery.data for more information - if ( !pvt ) { - delete cache[ id ].data; - - // Don't destroy the parent cache unless the internal data object - // had been the only thing left in it - if ( !isEmptyDataObject( cache[ id ] ) ) { - return; - } - } - - // Destroy the cache - if ( isNode ) { - jQuery.cleanData( [ elem ], true ); - - // Use delete when supported for expandos or `cache` is not a window per isWindow (#10080) - /* jshint eqeqeq: false */ - } else if ( support.deleteExpando || cache != cache.window ) { - /* jshint eqeqeq: true */ - delete cache[ id ]; - - // When all else fails, null - } else { - cache[ id ] = null; - } -} - -jQuery.extend({ - cache: {}, - - // The following elements (space-suffixed to avoid Object.prototype collisions) - // throw uncatchable exceptions if you attempt to set expando properties - noData: { - "applet ": true, - "embed ": true, - // ...but Flash objects (which have this classid) *can* handle expandos - "object ": "clsid:D27CDB6E-AE6D-11cf-96B8-444553540000" - }, - - hasData: function( elem ) { - elem = elem.nodeType ? jQuery.cache[ elem[jQuery.expando] ] : elem[ jQuery.expando ]; - return !!elem && !isEmptyDataObject( elem ); - }, - - data: function( elem, name, data ) { - return internalData( elem, name, data ); - }, - - removeData: function( elem, name ) { - return internalRemoveData( elem, name ); - }, - - // For internal use only. 
- _data: function( elem, name, data ) { - return internalData( elem, name, data, true ); - }, - - _removeData: function( elem, name ) { - return internalRemoveData( elem, name, true ); - } -}); - -jQuery.fn.extend({ - data: function( key, value ) { - var i, name, data, - elem = this[0], - attrs = elem && elem.attributes; - - // Special expections of .data basically thwart jQuery.access, - // so implement the relevant behavior ourselves - - // Gets all values - if ( key === undefined ) { - if ( this.length ) { - data = jQuery.data( elem ); - - if ( elem.nodeType === 1 && !jQuery._data( elem, "parsedAttrs" ) ) { - i = attrs.length; - while ( i-- ) { - - // Support: IE11+ - // The attrs elements can be null (#14894) - if ( attrs[ i ] ) { - name = attrs[ i ].name; - if ( name.indexOf( "data-" ) === 0 ) { - name = jQuery.camelCase( name.slice(5) ); - dataAttr( elem, name, data[ name ] ); - } - } - } - jQuery._data( elem, "parsedAttrs", true ); - } - } - - return data; - } - - // Sets multiple values - if ( typeof key === "object" ) { - return this.each(function() { - jQuery.data( this, key ); - }); - } - - return arguments.length > 1 ? - - // Sets one value - this.each(function() { - jQuery.data( this, key, value ); - }) : - - // Gets one value - // Try to fetch any internally stored data first - elem ? 
dataAttr( elem, key, jQuery.data( elem, key ) ) : undefined; - }, - - removeData: function( key ) { - return this.each(function() { - jQuery.removeData( this, key ); - }); - } -}); - - -jQuery.extend({ - queue: function( elem, type, data ) { - var queue; - - if ( elem ) { - type = ( type || "fx" ) + "queue"; - queue = jQuery._data( elem, type ); - - // Speed up dequeue by getting out quickly if this is just a lookup - if ( data ) { - if ( !queue || jQuery.isArray(data) ) { - queue = jQuery._data( elem, type, jQuery.makeArray(data) ); - } else { - queue.push( data ); - } - } - return queue || []; - } - }, - - dequeue: function( elem, type ) { - type = type || "fx"; - - var queue = jQuery.queue( elem, type ), - startLength = queue.length, - fn = queue.shift(), - hooks = jQuery._queueHooks( elem, type ), - next = function() { - jQuery.dequeue( elem, type ); - }; - - // If the fx queue is dequeued, always remove the progress sentinel - if ( fn === "inprogress" ) { - fn = queue.shift(); - startLength--; - } - - if ( fn ) { - - // Add a progress sentinel to prevent the fx queue from being - // automatically dequeued - if ( type === "fx" ) { - queue.unshift( "inprogress" ); - } - - // clear up the last queue stop function - delete hooks.stop; - fn.call( elem, next, hooks ); - } - - if ( !startLength && hooks ) { - hooks.empty.fire(); - } - }, - - // not intended for public consumption - generates a queueHooks object, or returns the current one - _queueHooks: function( elem, type ) { - var key = type + "queueHooks"; - return jQuery._data( elem, key ) || jQuery._data( elem, key, { - empty: jQuery.Callbacks("once memory").add(function() { - jQuery._removeData( elem, type + "queue" ); - jQuery._removeData( elem, key ); - }) - }); - } -}); - -jQuery.fn.extend({ - queue: function( type, data ) { - var setter = 2; - - if ( typeof type !== "string" ) { - data = type; - type = "fx"; - setter--; - } - - if ( arguments.length < setter ) { - return jQuery.queue( this[0], type ); - } 
- - return data === undefined ? - this : - this.each(function() { - var queue = jQuery.queue( this, type, data ); - - // ensure a hooks for this queue - jQuery._queueHooks( this, type ); - - if ( type === "fx" && queue[0] !== "inprogress" ) { - jQuery.dequeue( this, type ); - } - }); - }, - dequeue: function( type ) { - return this.each(function() { - jQuery.dequeue( this, type ); - }); - }, - clearQueue: function( type ) { - return this.queue( type || "fx", [] ); - }, - // Get a promise resolved when queues of a certain type - // are emptied (fx is the type by default) - promise: function( type, obj ) { - var tmp, - count = 1, - defer = jQuery.Deferred(), - elements = this, - i = this.length, - resolve = function() { - if ( !( --count ) ) { - defer.resolveWith( elements, [ elements ] ); - } - }; - - if ( typeof type !== "string" ) { - obj = type; - type = undefined; - } - type = type || "fx"; - - while ( i-- ) { - tmp = jQuery._data( elements[ i ], type + "queueHooks" ); - if ( tmp && tmp.empty ) { - count++; - tmp.empty.add( resolve ); - } - } - resolve(); - return defer.promise( obj ); - } -}); -var pnum = (/[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/).source; - -var cssExpand = [ "Top", "Right", "Bottom", "Left" ]; - -var isHidden = function( elem, el ) { - // isHidden might be called from jQuery#filter function; - // in that case, element will be second argument - elem = el || elem; - return jQuery.css( elem, "display" ) === "none" || !jQuery.contains( elem.ownerDocument, elem ); - }; - - - -// Multifunctional method to get and set values of a collection -// The value/s can optionally be executed if it's a function -var access = jQuery.access = function( elems, fn, key, value, chainable, emptyGet, raw ) { - var i = 0, - length = elems.length, - bulk = key == null; - - // Sets many values - if ( jQuery.type( key ) === "object" ) { - chainable = true; - for ( i in key ) { - jQuery.access( elems, fn, i, key[i], true, emptyGet, raw ); - } - - // Sets one value - } else 
if ( value !== undefined ) { - chainable = true; - - if ( !jQuery.isFunction( value ) ) { - raw = true; - } - - if ( bulk ) { - // Bulk operations run against the entire set - if ( raw ) { - fn.call( elems, value ); - fn = null; - - // ...except when executing function values - } else { - bulk = fn; - fn = function( elem, key, value ) { - return bulk.call( jQuery( elem ), value ); - }; - } - } - - if ( fn ) { - for ( ; i < length; i++ ) { - fn( elems[i], key, raw ? value : value.call( elems[i], i, fn( elems[i], key ) ) ); - } - } - } - - return chainable ? - elems : - - // Gets - bulk ? - fn.call( elems ) : - length ? fn( elems[0], key ) : emptyGet; -}; -var rcheckableType = (/^(?:checkbox|radio)$/i); - - - -(function() { - // Minified: var a,b,c - var input = document.createElement( "input" ), - div = document.createElement( "div" ), - fragment = document.createDocumentFragment(); - - // Setup - div.innerHTML = "
a"; - - // IE strips leading whitespace when .innerHTML is used - support.leadingWhitespace = div.firstChild.nodeType === 3; - - // Make sure that tbody elements aren't automatically inserted - // IE will insert them into empty tables - support.tbody = !div.getElementsByTagName( "tbody" ).length; - - // Make sure that link elements get serialized correctly by innerHTML - // This requires a wrapper element in IE - support.htmlSerialize = !!div.getElementsByTagName( "link" ).length; - - // Makes sure cloning an html5 element does not cause problems - // Where outerHTML is undefined, this still works - support.html5Clone = - document.createElement( "nav" ).cloneNode( true ).outerHTML !== "<:nav>"; - - // Check if a disconnected checkbox will retain its checked - // value of true after appended to the DOM (IE6/7) - input.type = "checkbox"; - input.checked = true; - fragment.appendChild( input ); - support.appendChecked = input.checked; - - // Make sure textarea (and checkbox) defaultValue is properly cloned - // Support: IE6-IE11+ - div.innerHTML = ""; - support.noCloneChecked = !!div.cloneNode( true ).lastChild.defaultValue; - - // #11217 - WebKit loses check when the name is after the checked attribute - fragment.appendChild( div ); - div.innerHTML = ""; - - // Support: Safari 5.1, iOS 5.1, Android 4.x, Android 2.3 - // old WebKit doesn't clone checked state correctly in fragments - support.checkClone = div.cloneNode( true ).cloneNode( true ).lastChild.checked; - - // Support: IE<9 - // Opera does not clone events (and typeof div.attachEvent === undefined). - // IE9-10 clones events bound via attachEvent, but they don't trigger with .click() - support.noCloneEvent = true; - if ( div.attachEvent ) { - div.attachEvent( "onclick", function() { - support.noCloneEvent = false; - }); - - div.cloneNode( true ).click(); - } - - // Execute the test only if not already executed in another module. 
- if (support.deleteExpando == null) { - // Support: IE<9 - support.deleteExpando = true; - try { - delete div.test; - } catch( e ) { - support.deleteExpando = false; - } - } -})(); - - -(function() { - var i, eventName, - div = document.createElement( "div" ); - - // Support: IE<9 (lack submit/change bubble), Firefox 23+ (lack focusin event) - for ( i in { submit: true, change: true, focusin: true }) { - eventName = "on" + i; - - if ( !(support[ i + "Bubbles" ] = eventName in window) ) { - // Beware of CSP restrictions (https://developer.mozilla.org/en/Security/CSP) - div.setAttribute( eventName, "t" ); - support[ i + "Bubbles" ] = div.attributes[ eventName ].expando === false; - } - } - - // Null elements to avoid leaks in IE. - div = null; -})(); - - -var rformElems = /^(?:input|select|textarea)$/i, - rkeyEvent = /^key/, - rmouseEvent = /^(?:mouse|pointer|contextmenu)|click/, - rfocusMorph = /^(?:focusinfocus|focusoutblur)$/, - rtypenamespace = /^([^.]*)(?:\.(.+)|)$/; - -function returnTrue() { - return true; -} - -function returnFalse() { - return false; -} - -function safeActiveElement() { - try { - return document.activeElement; - } catch ( err ) { } -} - -/* - * Helper functions for managing events -- not part of the public interface. - * Props to Dean Edwards' addEvent library for many of the ideas. 
- */ -jQuery.event = { - - global: {}, - - add: function( elem, types, handler, data, selector ) { - var tmp, events, t, handleObjIn, - special, eventHandle, handleObj, - handlers, type, namespaces, origType, - elemData = jQuery._data( elem ); - - // Don't attach events to noData or text/comment nodes (but allow plain objects) - if ( !elemData ) { - return; - } - - // Caller can pass in an object of custom data in lieu of the handler - if ( handler.handler ) { - handleObjIn = handler; - handler = handleObjIn.handler; - selector = handleObjIn.selector; - } - - // Make sure that the handler has a unique ID, used to find/remove it later - if ( !handler.guid ) { - handler.guid = jQuery.guid++; - } - - // Init the element's event structure and main handler, if this is the first - if ( !(events = elemData.events) ) { - events = elemData.events = {}; - } - if ( !(eventHandle = elemData.handle) ) { - eventHandle = elemData.handle = function( e ) { - // Discard the second event of a jQuery.event.trigger() and - // when an event is called after a page has unloaded - return typeof jQuery !== strundefined && (!e || jQuery.event.triggered !== e.type) ? - jQuery.event.dispatch.apply( eventHandle.elem, arguments ) : - undefined; - }; - // Add elem as a property of the handle fn to prevent a memory leak with IE non-native events - eventHandle.elem = elem; - } - - // Handle multiple events separated by a space - types = ( types || "" ).match( rnotwhite ) || [ "" ]; - t = types.length; - while ( t-- ) { - tmp = rtypenamespace.exec( types[t] ) || []; - type = origType = tmp[1]; - namespaces = ( tmp[2] || "" ).split( "." ).sort(); - - // There *must* be a type, no attaching namespace-only handlers - if ( !type ) { - continue; - } - - // If event changes its type, use the special event handlers for the changed type - special = jQuery.event.special[ type ] || {}; - - // If selector defined, determine special event api type, otherwise given type - type = ( selector ? 
special.delegateType : special.bindType ) || type; - - // Update special based on newly reset type - special = jQuery.event.special[ type ] || {}; - - // handleObj is passed to all event handlers - handleObj = jQuery.extend({ - type: type, - origType: origType, - data: data, - handler: handler, - guid: handler.guid, - selector: selector, - needsContext: selector && jQuery.expr.match.needsContext.test( selector ), - namespace: namespaces.join(".") - }, handleObjIn ); - - // Init the event handler queue if we're the first - if ( !(handlers = events[ type ]) ) { - handlers = events[ type ] = []; - handlers.delegateCount = 0; - - // Only use addEventListener/attachEvent if the special events handler returns false - if ( !special.setup || special.setup.call( elem, data, namespaces, eventHandle ) === false ) { - // Bind the global event handler to the element - if ( elem.addEventListener ) { - elem.addEventListener( type, eventHandle, false ); - - } else if ( elem.attachEvent ) { - elem.attachEvent( "on" + type, eventHandle ); - } - } - } - - if ( special.add ) { - special.add.call( elem, handleObj ); - - if ( !handleObj.handler.guid ) { - handleObj.handler.guid = handler.guid; - } - } - - // Add to the element's handler list, delegates in front - if ( selector ) { - handlers.splice( handlers.delegateCount++, 0, handleObj ); - } else { - handlers.push( handleObj ); - } - - // Keep track of which events have ever been used, for event optimization - jQuery.event.global[ type ] = true; - } - - // Nullify elem to prevent memory leaks in IE - elem = null; - }, - - // Detach an event or set of events from an element - remove: function( elem, types, handler, selector, mappedTypes ) { - var j, handleObj, tmp, - origCount, t, events, - special, handlers, type, - namespaces, origType, - elemData = jQuery.hasData( elem ) && jQuery._data( elem ); - - if ( !elemData || !(events = elemData.events) ) { - return; - } - - // Once for each type.namespace in types; type may be omitted - 
types = ( types || "" ).match( rnotwhite ) || [ "" ]; - t = types.length; - while ( t-- ) { - tmp = rtypenamespace.exec( types[t] ) || []; - type = origType = tmp[1]; - namespaces = ( tmp[2] || "" ).split( "." ).sort(); - - // Unbind all events (on this namespace, if provided) for the element - if ( !type ) { - for ( type in events ) { - jQuery.event.remove( elem, type + types[ t ], handler, selector, true ); - } - continue; - } - - special = jQuery.event.special[ type ] || {}; - type = ( selector ? special.delegateType : special.bindType ) || type; - handlers = events[ type ] || []; - tmp = tmp[2] && new RegExp( "(^|\\.)" + namespaces.join("\\.(?:.*\\.|)") + "(\\.|$)" ); - - // Remove matching events - origCount = j = handlers.length; - while ( j-- ) { - handleObj = handlers[ j ]; - - if ( ( mappedTypes || origType === handleObj.origType ) && - ( !handler || handler.guid === handleObj.guid ) && - ( !tmp || tmp.test( handleObj.namespace ) ) && - ( !selector || selector === handleObj.selector || selector === "**" && handleObj.selector ) ) { - handlers.splice( j, 1 ); - - if ( handleObj.selector ) { - handlers.delegateCount--; - } - if ( special.remove ) { - special.remove.call( elem, handleObj ); - } - } - } - - // Remove generic event handler if we removed something and no more handlers exist - // (avoids potential for endless recursion during removal of special event handlers) - if ( origCount && !handlers.length ) { - if ( !special.teardown || special.teardown.call( elem, namespaces, elemData.handle ) === false ) { - jQuery.removeEvent( elem, type, elemData.handle ); - } - - delete events[ type ]; - } - } - - // Remove the expando if it's no longer used - if ( jQuery.isEmptyObject( events ) ) { - delete elemData.handle; - - // removeData also checks for emptiness and clears the expando if empty - // so use it instead of delete - jQuery._removeData( elem, "events" ); - } - }, - - trigger: function( event, data, elem, onlyHandlers ) { - var handle, ontype, cur, - 
bubbleType, special, tmp, i, - eventPath = [ elem || document ], - type = hasOwn.call( event, "type" ) ? event.type : event, - namespaces = hasOwn.call( event, "namespace" ) ? event.namespace.split(".") : []; - - cur = tmp = elem = elem || document; - - // Don't do events on text and comment nodes - if ( elem.nodeType === 3 || elem.nodeType === 8 ) { - return; - } - - // focus/blur morphs to focusin/out; ensure we're not firing them right now - if ( rfocusMorph.test( type + jQuery.event.triggered ) ) { - return; - } - - if ( type.indexOf(".") >= 0 ) { - // Namespaced trigger; create a regexp to match event type in handle() - namespaces = type.split("."); - type = namespaces.shift(); - namespaces.sort(); - } - ontype = type.indexOf(":") < 0 && "on" + type; - - // Caller can pass in a jQuery.Event object, Object, or just an event type string - event = event[ jQuery.expando ] ? - event : - new jQuery.Event( type, typeof event === "object" && event ); - - // Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true) - event.isTrigger = onlyHandlers ? 2 : 3; - event.namespace = namespaces.join("."); - event.namespace_re = event.namespace ? - new RegExp( "(^|\\.)" + namespaces.join("\\.(?:.*\\.|)") + "(\\.|$)" ) : - null; - - // Clean up the event in case it is being reused - event.result = undefined; - if ( !event.target ) { - event.target = elem; - } - - // Clone any incoming data and prepend the event, creating the handler arg list - data = data == null ? 
- [ event ] : - jQuery.makeArray( data, [ event ] ); - - // Allow special events to draw outside the lines - special = jQuery.event.special[ type ] || {}; - if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) { - return; - } - - // Determine event propagation path in advance, per W3C events spec (#9951) - // Bubble up to document, then to window; watch for a global ownerDocument var (#9724) - if ( !onlyHandlers && !special.noBubble && !jQuery.isWindow( elem ) ) { - - bubbleType = special.delegateType || type; - if ( !rfocusMorph.test( bubbleType + type ) ) { - cur = cur.parentNode; - } - for ( ; cur; cur = cur.parentNode ) { - eventPath.push( cur ); - tmp = cur; - } - - // Only add window if we got to document (e.g., not plain obj or detached DOM) - if ( tmp === (elem.ownerDocument || document) ) { - eventPath.push( tmp.defaultView || tmp.parentWindow || window ); - } - } - - // Fire handlers on the event path - i = 0; - while ( (cur = eventPath[i++]) && !event.isPropagationStopped() ) { - - event.type = i > 1 ? - bubbleType : - special.bindType || type; - - // jQuery handler - handle = ( jQuery._data( cur, "events" ) || {} )[ event.type ] && jQuery._data( cur, "handle" ); - if ( handle ) { - handle.apply( cur, data ); - } - - // Native handler - handle = ontype && cur[ ontype ]; - if ( handle && handle.apply && jQuery.acceptData( cur ) ) { - event.result = handle.apply( cur, data ); - if ( event.result === false ) { - event.preventDefault(); - } - } - } - event.type = type; - - // If nobody prevented the default action, do it now - if ( !onlyHandlers && !event.isDefaultPrevented() ) { - - if ( (!special._default || special._default.apply( eventPath.pop(), data ) === false) && - jQuery.acceptData( elem ) ) { - - // Call a native DOM method on the target with the same name name as the event. - // Can't use an .isFunction() check here because IE6/7 fails that test. 
- // Don't do default actions on window, that's where global variables be (#6170) - if ( ontype && elem[ type ] && !jQuery.isWindow( elem ) ) { - - // Don't re-trigger an onFOO event when we call its FOO() method - tmp = elem[ ontype ]; - - if ( tmp ) { - elem[ ontype ] = null; - } - - // Prevent re-triggering of the same event, since we already bubbled it above - jQuery.event.triggered = type; - try { - elem[ type ](); - } catch ( e ) { - // IE<9 dies on focus/blur to hidden element (#1486,#12518) - // only reproducible on winXP IE8 native, not IE9 in IE8 mode - } - jQuery.event.triggered = undefined; - - if ( tmp ) { - elem[ ontype ] = tmp; - } - } - } - } - - return event.result; - }, - - dispatch: function( event ) { - - // Make a writable jQuery.Event from the native event object - event = jQuery.event.fix( event ); - - var i, ret, handleObj, matched, j, - handlerQueue = [], - args = slice.call( arguments ), - handlers = ( jQuery._data( this, "events" ) || {} )[ event.type ] || [], - special = jQuery.event.special[ event.type ] || {}; - - // Use the fix-ed jQuery.Event rather than the (read-only) native event - args[0] = event; - event.delegateTarget = this; - - // Call the preDispatch hook for the mapped type, and let it bail if desired - if ( special.preDispatch && special.preDispatch.call( this, event ) === false ) { - return; - } - - // Determine handlers - handlerQueue = jQuery.event.handlers.call( this, event, handlers ); - - // Run delegates first; they may want to stop propagation beneath us - i = 0; - while ( (matched = handlerQueue[ i++ ]) && !event.isPropagationStopped() ) { - event.currentTarget = matched.elem; - - j = 0; - while ( (handleObj = matched.handlers[ j++ ]) && !event.isImmediatePropagationStopped() ) { - - // Triggered event must either 1) have no namespace, or - // 2) have namespace(s) a subset or equal to those in the bound event (both can have no namespace). 
- if ( !event.namespace_re || event.namespace_re.test( handleObj.namespace ) ) { - - event.handleObj = handleObj; - event.data = handleObj.data; - - ret = ( (jQuery.event.special[ handleObj.origType ] || {}).handle || handleObj.handler ) - .apply( matched.elem, args ); - - if ( ret !== undefined ) { - if ( (event.result = ret) === false ) { - event.preventDefault(); - event.stopPropagation(); - } - } - } - } - } - - // Call the postDispatch hook for the mapped type - if ( special.postDispatch ) { - special.postDispatch.call( this, event ); - } - - return event.result; - }, - - handlers: function( event, handlers ) { - var sel, handleObj, matches, i, - handlerQueue = [], - delegateCount = handlers.delegateCount, - cur = event.target; - - // Find delegate handlers - // Black-hole SVG instance trees (#13180) - // Avoid non-left-click bubbling in Firefox (#3861) - if ( delegateCount && cur.nodeType && (!event.button || event.type !== "click") ) { - - /* jshint eqeqeq: false */ - for ( ; cur != this; cur = cur.parentNode || this ) { - /* jshint eqeqeq: true */ - - // Don't check non-elements (#13208) - // Don't process clicks on disabled elements (#6911, #8165, #11382, #11764) - if ( cur.nodeType === 1 && (cur.disabled !== true || event.type !== "click") ) { - matches = []; - for ( i = 0; i < delegateCount; i++ ) { - handleObj = handlers[ i ]; - - // Don't conflict with Object.prototype properties (#13203) - sel = handleObj.selector + " "; - - if ( matches[ sel ] === undefined ) { - matches[ sel ] = handleObj.needsContext ? 
- jQuery( sel, this ).index( cur ) >= 0 : - jQuery.find( sel, this, null, [ cur ] ).length; - } - if ( matches[ sel ] ) { - matches.push( handleObj ); - } - } - if ( matches.length ) { - handlerQueue.push({ elem: cur, handlers: matches }); - } - } - } - } - - // Add the remaining (directly-bound) handlers - if ( delegateCount < handlers.length ) { - handlerQueue.push({ elem: this, handlers: handlers.slice( delegateCount ) }); - } - - return handlerQueue; - }, - - fix: function( event ) { - if ( event[ jQuery.expando ] ) { - return event; - } - - // Create a writable copy of the event object and normalize some properties - var i, prop, copy, - type = event.type, - originalEvent = event, - fixHook = this.fixHooks[ type ]; - - if ( !fixHook ) { - this.fixHooks[ type ] = fixHook = - rmouseEvent.test( type ) ? this.mouseHooks : - rkeyEvent.test( type ) ? this.keyHooks : - {}; - } - copy = fixHook.props ? this.props.concat( fixHook.props ) : this.props; - - event = new jQuery.Event( originalEvent ); - - i = copy.length; - while ( i-- ) { - prop = copy[ i ]; - event[ prop ] = originalEvent[ prop ]; - } - - // Support: IE<9 - // Fix target property (#1925) - if ( !event.target ) { - event.target = originalEvent.srcElement || document; - } - - // Support: Chrome 23+, Safari? - // Target should not be a text node (#504, #13143) - if ( event.target.nodeType === 3 ) { - event.target = event.target.parentNode; - } - - // Support: IE<9 - // For mouse/key events, metaKey==false if it's undefined (#3368, #11328) - event.metaKey = !!event.metaKey; - - return fixHook.filter ? 
fixHook.filter( event, originalEvent ) : event; - }, - - // Includes some event props shared by KeyEvent and MouseEvent - props: "altKey bubbles cancelable ctrlKey currentTarget eventPhase metaKey relatedTarget shiftKey target timeStamp view which".split(" "), - - fixHooks: {}, - - keyHooks: { - props: "char charCode key keyCode".split(" "), - filter: function( event, original ) { - - // Add which for key events - if ( event.which == null ) { - event.which = original.charCode != null ? original.charCode : original.keyCode; - } - - return event; - } - }, - - mouseHooks: { - props: "button buttons clientX clientY fromElement offsetX offsetY pageX pageY screenX screenY toElement".split(" "), - filter: function( event, original ) { - var body, eventDoc, doc, - button = original.button, - fromElement = original.fromElement; - - // Calculate pageX/Y if missing and clientX/Y available - if ( event.pageX == null && original.clientX != null ) { - eventDoc = event.target.ownerDocument || document; - doc = eventDoc.documentElement; - body = eventDoc.body; - - event.pageX = original.clientX + ( doc && doc.scrollLeft || body && body.scrollLeft || 0 ) - ( doc && doc.clientLeft || body && body.clientLeft || 0 ); - event.pageY = original.clientY + ( doc && doc.scrollTop || body && body.scrollTop || 0 ) - ( doc && doc.clientTop || body && body.clientTop || 0 ); - } - - // Add relatedTarget, if necessary - if ( !event.relatedTarget && fromElement ) { - event.relatedTarget = fromElement === event.target ? original.toElement : fromElement; - } - - // Add which for click: 1 === left; 2 === middle; 3 === right - // Note: button is not normalized, so don't use it - if ( !event.which && button !== undefined ) { - event.which = ( button & 1 ? 1 : ( button & 2 ? 3 : ( button & 4 ? 
2 : 0 ) ) ); - } - - return event; - } - }, - - special: { - load: { - // Prevent triggered image.load events from bubbling to window.load - noBubble: true - }, - focus: { - // Fire native event if possible so blur/focus sequence is correct - trigger: function() { - if ( this !== safeActiveElement() && this.focus ) { - try { - this.focus(); - return false; - } catch ( e ) { - // Support: IE<9 - // If we error on focus to hidden element (#1486, #12518), - // let .trigger() run the handlers - } - } - }, - delegateType: "focusin" - }, - blur: { - trigger: function() { - if ( this === safeActiveElement() && this.blur ) { - this.blur(); - return false; - } - }, - delegateType: "focusout" - }, - click: { - // For checkbox, fire native event so checked state will be right - trigger: function() { - if ( jQuery.nodeName( this, "input" ) && this.type === "checkbox" && this.click ) { - this.click(); - return false; - } - }, - - // For cross-browser consistency, don't fire native .click() on links - _default: function( event ) { - return jQuery.nodeName( event.target, "a" ); - } - }, - - beforeunload: { - postDispatch: function( event ) { - - // Support: Firefox 20+ - // Firefox doesn't alert if the returnValue field is not set. - if ( event.result !== undefined && event.originalEvent ) { - event.originalEvent.returnValue = event.result; - } - } - } - }, - - simulate: function( type, elem, event, bubble ) { - // Piggyback on a donor event to simulate a different one. - // Fake originalEvent to avoid donor's stopPropagation, but if the - // simulated event prevents default then we do the same on the donor. - var e = jQuery.extend( - new jQuery.Event(), - event, - { - type: type, - isSimulated: true, - originalEvent: {} - } - ); - if ( bubble ) { - jQuery.event.trigger( e, null, elem ); - } else { - jQuery.event.dispatch.call( elem, e ); - } - if ( e.isDefaultPrevented() ) { - event.preventDefault(); - } - } -}; - -jQuery.removeEvent = document.removeEventListener ? 
- function( elem, type, handle ) { - if ( elem.removeEventListener ) { - elem.removeEventListener( type, handle, false ); - } - } : - function( elem, type, handle ) { - var name = "on" + type; - - if ( elem.detachEvent ) { - - // #8545, #7054, preventing memory leaks for custom events in IE6-8 - // detachEvent needed property on element, by name of that event, to properly expose it to GC - if ( typeof elem[ name ] === strundefined ) { - elem[ name ] = null; - } - - elem.detachEvent( name, handle ); - } - }; - -jQuery.Event = function( src, props ) { - // Allow instantiation without the 'new' keyword - if ( !(this instanceof jQuery.Event) ) { - return new jQuery.Event( src, props ); - } - - // Event object - if ( src && src.type ) { - this.originalEvent = src; - this.type = src.type; - - // Events bubbling up the document may have been marked as prevented - // by a handler lower down the tree; reflect the correct value. - this.isDefaultPrevented = src.defaultPrevented || - src.defaultPrevented === undefined && - // Support: IE < 9, Android < 4.0 - src.returnValue === false ? 
- returnTrue : - returnFalse; - - // Event type - } else { - this.type = src; - } - - // Put explicitly provided properties onto the event object - if ( props ) { - jQuery.extend( this, props ); - } - - // Create a timestamp if incoming event doesn't have one - this.timeStamp = src && src.timeStamp || jQuery.now(); - - // Mark it as fixed - this[ jQuery.expando ] = true; -}; - -// jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding -// http://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html -jQuery.Event.prototype = { - isDefaultPrevented: returnFalse, - isPropagationStopped: returnFalse, - isImmediatePropagationStopped: returnFalse, - - preventDefault: function() { - var e = this.originalEvent; - - this.isDefaultPrevented = returnTrue; - if ( !e ) { - return; - } - - // If preventDefault exists, run it on the original event - if ( e.preventDefault ) { - e.preventDefault(); - - // Support: IE - // Otherwise set the returnValue property of the original event to false - } else { - e.returnValue = false; - } - }, - stopPropagation: function() { - var e = this.originalEvent; - - this.isPropagationStopped = returnTrue; - if ( !e ) { - return; - } - // If stopPropagation exists, run it on the original event - if ( e.stopPropagation ) { - e.stopPropagation(); - } - - // Support: IE - // Set the cancelBubble property of the original event to true - e.cancelBubble = true; - }, - stopImmediatePropagation: function() { - var e = this.originalEvent; - - this.isImmediatePropagationStopped = returnTrue; - - if ( e && e.stopImmediatePropagation ) { - e.stopImmediatePropagation(); - } - - this.stopPropagation(); - } -}; - -// Create mouseenter/leave events using mouseover/out and event-time checks -jQuery.each({ - mouseenter: "mouseover", - mouseleave: "mouseout", - pointerenter: "pointerover", - pointerleave: "pointerout" -}, function( orig, fix ) { - jQuery.event.special[ orig ] = { - delegateType: fix, - bindType: fix, 
- - handle: function( event ) { - var ret, - target = this, - related = event.relatedTarget, - handleObj = event.handleObj; - - // For mousenter/leave call the handler if related is outside the target. - // NB: No relatedTarget if the mouse left/entered the browser window - if ( !related || (related !== target && !jQuery.contains( target, related )) ) { - event.type = handleObj.origType; - ret = handleObj.handler.apply( this, arguments ); - event.type = fix; - } - return ret; - } - }; -}); - -// IE submit delegation -if ( !support.submitBubbles ) { - - jQuery.event.special.submit = { - setup: function() { - // Only need this for delegated form submit events - if ( jQuery.nodeName( this, "form" ) ) { - return false; - } - - // Lazy-add a submit handler when a descendant form may potentially be submitted - jQuery.event.add( this, "click._submit keypress._submit", function( e ) { - // Node name check avoids a VML-related crash in IE (#9807) - var elem = e.target, - form = jQuery.nodeName( elem, "input" ) || jQuery.nodeName( elem, "button" ) ? 
elem.form : undefined; - if ( form && !jQuery._data( form, "submitBubbles" ) ) { - jQuery.event.add( form, "submit._submit", function( event ) { - event._submit_bubble = true; - }); - jQuery._data( form, "submitBubbles", true ); - } - }); - // return undefined since we don't need an event listener - }, - - postDispatch: function( event ) { - // If form was submitted by the user, bubble the event up the tree - if ( event._submit_bubble ) { - delete event._submit_bubble; - if ( this.parentNode && !event.isTrigger ) { - jQuery.event.simulate( "submit", this.parentNode, event, true ); - } - } - }, - - teardown: function() { - // Only need this for delegated form submit events - if ( jQuery.nodeName( this, "form" ) ) { - return false; - } - - // Remove delegated handlers; cleanData eventually reaps submit handlers attached above - jQuery.event.remove( this, "._submit" ); - } - }; -} - -// IE change delegation and checkbox/radio fix -if ( !support.changeBubbles ) { - - jQuery.event.special.change = { - - setup: function() { - - if ( rformElems.test( this.nodeName ) ) { - // IE doesn't fire change on a check/radio until blur; trigger it on click - // after a propertychange. Eat the blur-change in special.change.handle. - // This still fires onchange a second time for check/radio after blur. 
- if ( this.type === "checkbox" || this.type === "radio" ) { - jQuery.event.add( this, "propertychange._change", function( event ) { - if ( event.originalEvent.propertyName === "checked" ) { - this._just_changed = true; - } - }); - jQuery.event.add( this, "click._change", function( event ) { - if ( this._just_changed && !event.isTrigger ) { - this._just_changed = false; - } - // Allow triggered, simulated change events (#11500) - jQuery.event.simulate( "change", this, event, true ); - }); - } - return false; - } - // Delegated event; lazy-add a change handler on descendant inputs - jQuery.event.add( this, "beforeactivate._change", function( e ) { - var elem = e.target; - - if ( rformElems.test( elem.nodeName ) && !jQuery._data( elem, "changeBubbles" ) ) { - jQuery.event.add( elem, "change._change", function( event ) { - if ( this.parentNode && !event.isSimulated && !event.isTrigger ) { - jQuery.event.simulate( "change", this.parentNode, event, true ); - } - }); - jQuery._data( elem, "changeBubbles", true ); - } - }); - }, - - handle: function( event ) { - var elem = event.target; - - // Swallow native change events from checkbox/radio, we already triggered them above - if ( this !== elem || event.isSimulated || event.isTrigger || (elem.type !== "radio" && elem.type !== "checkbox") ) { - return event.handleObj.handler.apply( this, arguments ); - } - }, - - teardown: function() { - jQuery.event.remove( this, "._change" ); - - return !rformElems.test( this.nodeName ); - } - }; -} - -// Create "bubbling" focus and blur events -if ( !support.focusinBubbles ) { - jQuery.each({ focus: "focusin", blur: "focusout" }, function( orig, fix ) { - - // Attach a single capturing handler on the document while someone wants focusin/focusout - var handler = function( event ) { - jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ), true ); - }; - - jQuery.event.special[ fix ] = { - setup: function() { - var doc = this.ownerDocument || this, - attaches = jQuery._data( 
doc, fix ); - - if ( !attaches ) { - doc.addEventListener( orig, handler, true ); - } - jQuery._data( doc, fix, ( attaches || 0 ) + 1 ); - }, - teardown: function() { - var doc = this.ownerDocument || this, - attaches = jQuery._data( doc, fix ) - 1; - - if ( !attaches ) { - doc.removeEventListener( orig, handler, true ); - jQuery._removeData( doc, fix ); - } else { - jQuery._data( doc, fix, attaches ); - } - } - }; - }); -} - -jQuery.fn.extend({ - - on: function( types, selector, data, fn, /*INTERNAL*/ one ) { - var type, origFn; - - // Types can be a map of types/handlers - if ( typeof types === "object" ) { - // ( types-Object, selector, data ) - if ( typeof selector !== "string" ) { - // ( types-Object, data ) - data = data || selector; - selector = undefined; - } - for ( type in types ) { - this.on( type, selector, data, types[ type ], one ); - } - return this; - } - - if ( data == null && fn == null ) { - // ( types, fn ) - fn = selector; - data = selector = undefined; - } else if ( fn == null ) { - if ( typeof selector === "string" ) { - // ( types, selector, fn ) - fn = data; - data = undefined; - } else { - // ( types, data, fn ) - fn = data; - data = selector; - selector = undefined; - } - } - if ( fn === false ) { - fn = returnFalse; - } else if ( !fn ) { - return this; - } - - if ( one === 1 ) { - origFn = fn; - fn = function( event ) { - // Can use an empty set, since event contains the info - jQuery().off( event ); - return origFn.apply( this, arguments ); - }; - // Use same guid so caller can remove using origFn - fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ ); - } - return this.each( function() { - jQuery.event.add( this, types, fn, data, selector ); - }); - }, - one: function( types, selector, data, fn ) { - return this.on( types, selector, data, fn, 1 ); - }, - off: function( types, selector, fn ) { - var handleObj, type; - if ( types && types.preventDefault && types.handleObj ) { - // ( event ) dispatched jQuery.Event - handleObj = 
types.handleObj; - jQuery( types.delegateTarget ).off( - handleObj.namespace ? handleObj.origType + "." + handleObj.namespace : handleObj.origType, - handleObj.selector, - handleObj.handler - ); - return this; - } - if ( typeof types === "object" ) { - // ( types-object [, selector] ) - for ( type in types ) { - this.off( type, selector, types[ type ] ); - } - return this; - } - if ( selector === false || typeof selector === "function" ) { - // ( types [, fn] ) - fn = selector; - selector = undefined; - } - if ( fn === false ) { - fn = returnFalse; - } - return this.each(function() { - jQuery.event.remove( this, types, fn, selector ); - }); - }, - - trigger: function( type, data ) { - return this.each(function() { - jQuery.event.trigger( type, data, this ); - }); - }, - triggerHandler: function( type, data ) { - var elem = this[0]; - if ( elem ) { - return jQuery.event.trigger( type, data, elem, true ); - } - } -}); - - -function createSafeFragment( document ) { - var list = nodeNames.split( "|" ), - safeFrag = document.createDocumentFragment(); - - if ( safeFrag.createElement ) { - while ( list.length ) { - safeFrag.createElement( - list.pop() - ); - } - } - return safeFrag; -} - -var nodeNames = "abbr|article|aside|audio|bdi|canvas|data|datalist|details|figcaption|figure|footer|" + - "header|hgroup|mark|meter|nav|output|progress|section|summary|time|video", - rinlinejQuery = / jQuery\d+="(?:null|\d+)"/g, - rnoshimcache = new RegExp("<(?:" + nodeNames + ")[\\s/>]", "i"), - rleadingWhitespace = /^\s+/, - rxhtmlTag = /<(?!area|br|col|embed|hr|img|input|link|meta|param)(([\w:]+)[^>]*)\/>/gi, - rtagName = /<([\w:]+)/, - rtbody = /\s*$/g, - - // We have to close these tags to support XHTML (#13200) - wrapMap = { - option: [ 1, "" ], - legend: [ 1, "
", "
" ], - area: [ 1, "", "" ], - param: [ 1, "", "" ], - thead: [ 1, "", "
" ], - tr: [ 2, "", "
" ], - col: [ 2, "", "
" ], - td: [ 3, "", "
" ], - - // IE6-8 can't serialize link, script, style, or any html5 (NoScope) tags, - // unless wrapped in a div with non-breaking characters in front of it. - _default: support.htmlSerialize ? [ 0, "", "" ] : [ 1, "X
", "
" ] - }, - safeFragment = createSafeFragment( document ), - fragmentDiv = safeFragment.appendChild( document.createElement("div") ); - -wrapMap.optgroup = wrapMap.option; -wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.thead; -wrapMap.th = wrapMap.td; - -function getAll( context, tag ) { - var elems, elem, - i = 0, - found = typeof context.getElementsByTagName !== strundefined ? context.getElementsByTagName( tag || "*" ) : - typeof context.querySelectorAll !== strundefined ? context.querySelectorAll( tag || "*" ) : - undefined; - - if ( !found ) { - for ( found = [], elems = context.childNodes || context; (elem = elems[i]) != null; i++ ) { - if ( !tag || jQuery.nodeName( elem, tag ) ) { - found.push( elem ); - } else { - jQuery.merge( found, getAll( elem, tag ) ); - } - } - } - - return tag === undefined || tag && jQuery.nodeName( context, tag ) ? - jQuery.merge( [ context ], found ) : - found; -} - -// Used in buildFragment, fixes the defaultChecked property -function fixDefaultChecked( elem ) { - if ( rcheckableType.test( elem.type ) ) { - elem.defaultChecked = elem.checked; - } -} - -// Support: IE<8 -// Manipulating tables requires a tbody -function manipulationTarget( elem, content ) { - return jQuery.nodeName( elem, "table" ) && - jQuery.nodeName( content.nodeType !== 11 ? content : content.firstChild, "tr" ) ? 
- - elem.getElementsByTagName("tbody")[0] || - elem.appendChild( elem.ownerDocument.createElement("tbody") ) : - elem; -} - -// Replace/restore the type attribute of script elements for safe DOM manipulation -function disableScript( elem ) { - elem.type = (jQuery.find.attr( elem, "type" ) !== null) + "/" + elem.type; - return elem; -} -function restoreScript( elem ) { - var match = rscriptTypeMasked.exec( elem.type ); - if ( match ) { - elem.type = match[1]; - } else { - elem.removeAttribute("type"); - } - return elem; -} - -// Mark scripts as having already been evaluated -function setGlobalEval( elems, refElements ) { - var elem, - i = 0; - for ( ; (elem = elems[i]) != null; i++ ) { - jQuery._data( elem, "globalEval", !refElements || jQuery._data( refElements[i], "globalEval" ) ); - } -} - -function cloneCopyEvent( src, dest ) { - - if ( dest.nodeType !== 1 || !jQuery.hasData( src ) ) { - return; - } - - var type, i, l, - oldData = jQuery._data( src ), - curData = jQuery._data( dest, oldData ), - events = oldData.events; - - if ( events ) { - delete curData.handle; - curData.events = {}; - - for ( type in events ) { - for ( i = 0, l = events[ type ].length; i < l; i++ ) { - jQuery.event.add( dest, type, events[ type ][ i ] ); - } - } - } - - // make the cloned public data object a copy from the original - if ( curData.data ) { - curData.data = jQuery.extend( {}, curData.data ); - } -} - -function fixCloneNodeIssues( src, dest ) { - var nodeName, e, data; - - // We do not need to do anything for non-Elements - if ( dest.nodeType !== 1 ) { - return; - } - - nodeName = dest.nodeName.toLowerCase(); - - // IE6-8 copies events bound via attachEvent when using cloneNode. 
- if ( !support.noCloneEvent && dest[ jQuery.expando ] ) { - data = jQuery._data( dest ); - - for ( e in data.events ) { - jQuery.removeEvent( dest, e, data.handle ); - } - - // Event data gets referenced instead of copied if the expando gets copied too - dest.removeAttribute( jQuery.expando ); - } - - // IE blanks contents when cloning scripts, and tries to evaluate newly-set text - if ( nodeName === "script" && dest.text !== src.text ) { - disableScript( dest ).text = src.text; - restoreScript( dest ); - - // IE6-10 improperly clones children of object elements using classid. - // IE10 throws NoModificationAllowedError if parent is null, #12132. - } else if ( nodeName === "object" ) { - if ( dest.parentNode ) { - dest.outerHTML = src.outerHTML; - } - - // This path appears unavoidable for IE9. When cloning an object - // element in IE9, the outerHTML strategy above is not sufficient. - // If the src has innerHTML and the destination does not, - // copy the src.innerHTML into the dest.innerHTML. #10324 - if ( support.html5Clone && ( src.innerHTML && !jQuery.trim(dest.innerHTML) ) ) { - dest.innerHTML = src.innerHTML; - } - - } else if ( nodeName === "input" && rcheckableType.test( src.type ) ) { - // IE6-8 fails to persist the checked state of a cloned checkbox - // or radio button. 
Worse, IE6-7 fail to give the cloned element - // a checked appearance if the defaultChecked value isn't also set - - dest.defaultChecked = dest.checked = src.checked; - - // IE6-7 get confused and end up setting the value of a cloned - // checkbox/radio button to an empty string instead of "on" - if ( dest.value !== src.value ) { - dest.value = src.value; - } - - // IE6-8 fails to return the selected option to the default selected - // state when cloning options - } else if ( nodeName === "option" ) { - dest.defaultSelected = dest.selected = src.defaultSelected; - - // IE6-8 fails to set the defaultValue to the correct value when - // cloning other types of input fields - } else if ( nodeName === "input" || nodeName === "textarea" ) { - dest.defaultValue = src.defaultValue; - } -} - -jQuery.extend({ - clone: function( elem, dataAndEvents, deepDataAndEvents ) { - var destElements, node, clone, i, srcElements, - inPage = jQuery.contains( elem.ownerDocument, elem ); - - if ( support.html5Clone || jQuery.isXMLDoc(elem) || !rnoshimcache.test( "<" + elem.nodeName + ">" ) ) { - clone = elem.cloneNode( true ); - - // IE<=8 does not properly clone detached, unknown element nodes - } else { - fragmentDiv.innerHTML = elem.outerHTML; - fragmentDiv.removeChild( clone = fragmentDiv.firstChild ); - } - - if ( (!support.noCloneEvent || !support.noCloneChecked) && - (elem.nodeType === 1 || elem.nodeType === 11) && !jQuery.isXMLDoc(elem) ) { - - // We eschew Sizzle here for performance reasons: http://jsperf.com/getall-vs-sizzle/2 - destElements = getAll( clone ); - srcElements = getAll( elem ); - - // Fix all IE cloning issues - for ( i = 0; (node = srcElements[i]) != null; ++i ) { - // Ensure that the destination node is not null; Fixes #9587 - if ( destElements[i] ) { - fixCloneNodeIssues( node, destElements[i] ); - } - } - } - - // Copy the events from the original to the clone - if ( dataAndEvents ) { - if ( deepDataAndEvents ) { - srcElements = srcElements || getAll( elem ); 
- destElements = destElements || getAll( clone ); - - for ( i = 0; (node = srcElements[i]) != null; i++ ) { - cloneCopyEvent( node, destElements[i] ); - } - } else { - cloneCopyEvent( elem, clone ); - } - } - - // Preserve script evaluation history - destElements = getAll( clone, "script" ); - if ( destElements.length > 0 ) { - setGlobalEval( destElements, !inPage && getAll( elem, "script" ) ); - } - - destElements = srcElements = node = null; - - // Return the cloned set - return clone; - }, - - buildFragment: function( elems, context, scripts, selection ) { - var j, elem, contains, - tmp, tag, tbody, wrap, - l = elems.length, - - // Ensure a safe fragment - safe = createSafeFragment( context ), - - nodes = [], - i = 0; - - for ( ; i < l; i++ ) { - elem = elems[ i ]; - - if ( elem || elem === 0 ) { - - // Add nodes directly - if ( jQuery.type( elem ) === "object" ) { - jQuery.merge( nodes, elem.nodeType ? [ elem ] : elem ); - - // Convert non-html into a text node - } else if ( !rhtml.test( elem ) ) { - nodes.push( context.createTextNode( elem ) ); - - // Convert html into DOM nodes - } else { - tmp = tmp || safe.appendChild( context.createElement("div") ); - - // Deserialize a standard representation - tag = (rtagName.exec( elem ) || [ "", "" ])[ 1 ].toLowerCase(); - wrap = wrapMap[ tag ] || wrapMap._default; - - tmp.innerHTML = wrap[1] + elem.replace( rxhtmlTag, "<$1>" ) + wrap[2]; - - // Descend through wrappers to the right content - j = wrap[0]; - while ( j-- ) { - tmp = tmp.lastChild; - } - - // Manually add leading whitespace removed by IE - if ( !support.leadingWhitespace && rleadingWhitespace.test( elem ) ) { - nodes.push( context.createTextNode( rleadingWhitespace.exec( elem )[0] ) ); - } - - // Remove IE's autoinserted from table fragments - if ( !support.tbody ) { - - // String was a , *may* have spurious - elem = tag === "table" && !rtbody.test( elem ) ? - tmp.firstChild : - - // String was a bare or - wrap[1] === "
" && !rtbody.test( elem ) ? - tmp : - 0; - - j = elem && elem.childNodes.length; - while ( j-- ) { - if ( jQuery.nodeName( (tbody = elem.childNodes[j]), "tbody" ) && !tbody.childNodes.length ) { - elem.removeChild( tbody ); - } - } - } - - jQuery.merge( nodes, tmp.childNodes ); - - // Fix #12392 for WebKit and IE > 9 - tmp.textContent = ""; - - // Fix #12392 for oldIE - while ( tmp.firstChild ) { - tmp.removeChild( tmp.firstChild ); - } - - // Remember the top-level container for proper cleanup - tmp = safe.lastChild; - } - } - } - - // Fix #11356: Clear elements from fragment - if ( tmp ) { - safe.removeChild( tmp ); - } - - // Reset defaultChecked for any radios and checkboxes - // about to be appended to the DOM in IE 6/7 (#8060) - if ( !support.appendChecked ) { - jQuery.grep( getAll( nodes, "input" ), fixDefaultChecked ); - } - - i = 0; - while ( (elem = nodes[ i++ ]) ) { - - // #4087 - If origin and destination elements are the same, and this is - // that element, do not do anything - if ( selection && jQuery.inArray( elem, selection ) !== -1 ) { - continue; - } - - contains = jQuery.contains( elem.ownerDocument, elem ); - - // Append to fragment - tmp = getAll( safe.appendChild( elem ), "script" ); - - // Preserve script evaluation history - if ( contains ) { - setGlobalEval( tmp ); - } - - // Capture executables - if ( scripts ) { - j = 0; - while ( (elem = tmp[ j++ ]) ) { - if ( rscriptType.test( elem.type || "" ) ) { - scripts.push( elem ); - } - } - } - } - - tmp = null; - - return safe; - }, - - cleanData: function( elems, /* internal */ acceptData ) { - var elem, type, id, data, - i = 0, - internalKey = jQuery.expando, - cache = jQuery.cache, - deleteExpando = support.deleteExpando, - special = jQuery.event.special; - - for ( ; (elem = elems[i]) != null; i++ ) { - if ( acceptData || jQuery.acceptData( elem ) ) { - - id = elem[ internalKey ]; - data = id && cache[ id ]; - - if ( data ) { - if ( data.events ) { - for ( type in data.events ) { - if ( 
special[ type ] ) { - jQuery.event.remove( elem, type ); - - // This is a shortcut to avoid jQuery.event.remove's overhead - } else { - jQuery.removeEvent( elem, type, data.handle ); - } - } - } - - // Remove cache only if it was not already removed by jQuery.event.remove - if ( cache[ id ] ) { - - delete cache[ id ]; - - // IE does not allow us to delete expando properties from nodes, - // nor does it have a removeAttribute function on Document nodes; - // we must handle all of these cases - if ( deleteExpando ) { - delete elem[ internalKey ]; - - } else if ( typeof elem.removeAttribute !== strundefined ) { - elem.removeAttribute( internalKey ); - - } else { - elem[ internalKey ] = null; - } - - deletedIds.push( id ); - } - } - } - } - } -}); - -jQuery.fn.extend({ - text: function( value ) { - return access( this, function( value ) { - return value === undefined ? - jQuery.text( this ) : - this.empty().append( ( this[0] && this[0].ownerDocument || document ).createTextNode( value ) ); - }, null, value, arguments.length ); - }, - - append: function() { - return this.domManip( arguments, function( elem ) { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - var target = manipulationTarget( this, elem ); - target.appendChild( elem ); - } - }); - }, - - prepend: function() { - return this.domManip( arguments, function( elem ) { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - var target = manipulationTarget( this, elem ); - target.insertBefore( elem, target.firstChild ); - } - }); - }, - - before: function() { - return this.domManip( arguments, function( elem ) { - if ( this.parentNode ) { - this.parentNode.insertBefore( elem, this ); - } - }); - }, - - after: function() { - return this.domManip( arguments, function( elem ) { - if ( this.parentNode ) { - this.parentNode.insertBefore( elem, this.nextSibling ); - } - }); - }, - - remove: function( selector, keepData /* Internal Use Only */ ) { - var elem, - 
elems = selector ? jQuery.filter( selector, this ) : this, - i = 0; - - for ( ; (elem = elems[i]) != null; i++ ) { - - if ( !keepData && elem.nodeType === 1 ) { - jQuery.cleanData( getAll( elem ) ); - } - - if ( elem.parentNode ) { - if ( keepData && jQuery.contains( elem.ownerDocument, elem ) ) { - setGlobalEval( getAll( elem, "script" ) ); - } - elem.parentNode.removeChild( elem ); - } - } - - return this; - }, - - empty: function() { - var elem, - i = 0; - - for ( ; (elem = this[i]) != null; i++ ) { - // Remove element nodes and prevent memory leaks - if ( elem.nodeType === 1 ) { - jQuery.cleanData( getAll( elem, false ) ); - } - - // Remove any remaining nodes - while ( elem.firstChild ) { - elem.removeChild( elem.firstChild ); - } - - // If this is a select, ensure that it displays empty (#12336) - // Support: IE<9 - if ( elem.options && jQuery.nodeName( elem, "select" ) ) { - elem.options.length = 0; - } - } - - return this; - }, - - clone: function( dataAndEvents, deepDataAndEvents ) { - dataAndEvents = dataAndEvents == null ? false : dataAndEvents; - deepDataAndEvents = deepDataAndEvents == null ? dataAndEvents : deepDataAndEvents; - - return this.map(function() { - return jQuery.clone( this, dataAndEvents, deepDataAndEvents ); - }); - }, - - html: function( value ) { - return access( this, function( value ) { - var elem = this[ 0 ] || {}, - i = 0, - l = this.length; - - if ( value === undefined ) { - return elem.nodeType === 1 ? 
- elem.innerHTML.replace( rinlinejQuery, "" ) : - undefined; - } - - // See if we can take a shortcut and just use innerHTML - if ( typeof value === "string" && !rnoInnerhtml.test( value ) && - ( support.htmlSerialize || !rnoshimcache.test( value ) ) && - ( support.leadingWhitespace || !rleadingWhitespace.test( value ) ) && - !wrapMap[ (rtagName.exec( value ) || [ "", "" ])[ 1 ].toLowerCase() ] ) { - - value = value.replace( rxhtmlTag, "<$1>" ); - - try { - for (; i < l; i++ ) { - // Remove element nodes and prevent memory leaks - elem = this[i] || {}; - if ( elem.nodeType === 1 ) { - jQuery.cleanData( getAll( elem, false ) ); - elem.innerHTML = value; - } - } - - elem = 0; - - // If using innerHTML throws an exception, use the fallback method - } catch(e) {} - } - - if ( elem ) { - this.empty().append( value ); - } - }, null, value, arguments.length ); - }, - - replaceWith: function() { - var arg = arguments[ 0 ]; - - // Make the changes, replacing each context element with the new content - this.domManip( arguments, function( elem ) { - arg = this.parentNode; - - jQuery.cleanData( getAll( this ) ); - - if ( arg ) { - arg.replaceChild( elem, this ); - } - }); - - // Force removal if there was no new content (e.g., from empty arguments) - return arg && (arg.length || arg.nodeType) ? 
this : this.remove(); - }, - - detach: function( selector ) { - return this.remove( selector, true ); - }, - - domManip: function( args, callback ) { - - // Flatten any nested arrays - args = concat.apply( [], args ); - - var first, node, hasScripts, - scripts, doc, fragment, - i = 0, - l = this.length, - set = this, - iNoClone = l - 1, - value = args[0], - isFunction = jQuery.isFunction( value ); - - // We can't cloneNode fragments that contain checked, in WebKit - if ( isFunction || - ( l > 1 && typeof value === "string" && - !support.checkClone && rchecked.test( value ) ) ) { - return this.each(function( index ) { - var self = set.eq( index ); - if ( isFunction ) { - args[0] = value.call( this, index, self.html() ); - } - self.domManip( args, callback ); - }); - } - - if ( l ) { - fragment = jQuery.buildFragment( args, this[ 0 ].ownerDocument, false, this ); - first = fragment.firstChild; - - if ( fragment.childNodes.length === 1 ) { - fragment = first; - } - - if ( first ) { - scripts = jQuery.map( getAll( fragment, "script" ), disableScript ); - hasScripts = scripts.length; - - // Use the original fragment for the last item instead of the first because it can end up - // being emptied incorrectly in certain situations (#8070). 
- for ( ; i < l; i++ ) { - node = fragment; - - if ( i !== iNoClone ) { - node = jQuery.clone( node, true, true ); - - // Keep references to cloned scripts for later restoration - if ( hasScripts ) { - jQuery.merge( scripts, getAll( node, "script" ) ); - } - } - - callback.call( this[i], node, i ); - } - - if ( hasScripts ) { - doc = scripts[ scripts.length - 1 ].ownerDocument; - - // Reenable scripts - jQuery.map( scripts, restoreScript ); - - // Evaluate executable scripts on first document insertion - for ( i = 0; i < hasScripts; i++ ) { - node = scripts[ i ]; - if ( rscriptType.test( node.type || "" ) && - !jQuery._data( node, "globalEval" ) && jQuery.contains( doc, node ) ) { - - if ( node.src ) { - // Optional AJAX dependency, but won't run scripts if not present - if ( jQuery._evalUrl ) { - jQuery._evalUrl( node.src ); - } - } else { - jQuery.globalEval( ( node.text || node.textContent || node.innerHTML || "" ).replace( rcleanScript, "" ) ); - } - } - } - } - - // Fix #11809: Avoid leaking memory - fragment = first = null; - } - } - - return this; - } -}); - -jQuery.each({ - appendTo: "append", - prependTo: "prepend", - insertBefore: "before", - insertAfter: "after", - replaceAll: "replaceWith" -}, function( name, original ) { - jQuery.fn[ name ] = function( selector ) { - var elems, - i = 0, - ret = [], - insert = jQuery( selector ), - last = insert.length - 1; - - for ( ; i <= last; i++ ) { - elems = i === last ? 
this : this.clone(true); - jQuery( insert[i] )[ original ]( elems ); - - // Modern browsers can apply jQuery collections as arrays, but oldIE needs a .get() - push.apply( ret, elems.get() ); - } - - return this.pushStack( ret ); - }; -}); - - -var iframe, - elemdisplay = {}; - -/** - * Retrieve the actual display of a element - * @param {String} name nodeName of the element - * @param {Object} doc Document object - */ -// Called only from within defaultDisplay -function actualDisplay( name, doc ) { - var style, - elem = jQuery( doc.createElement( name ) ).appendTo( doc.body ), - - // getDefaultComputedStyle might be reliably used only on attached element - display = window.getDefaultComputedStyle && ( style = window.getDefaultComputedStyle( elem[ 0 ] ) ) ? - - // Use of this method is a temporary fix (more like optmization) until something better comes along, - // since it was removed from specification and supported only in FF - style.display : jQuery.css( elem[ 0 ], "display" ); - - // We don't have any data stored on the element, - // so use "detach" method as fast way to get rid of the element - elem.detach(); - - return display; -} - -/** - * Try to determine the default display value of an element - * @param {String} nodeName - */ -function defaultDisplay( nodeName ) { - var doc = document, - display = elemdisplay[ nodeName ]; - - if ( !display ) { - display = actualDisplay( nodeName, doc ); - - // If the simple way fails, read from inside an iframe - if ( display === "none" || !display ) { - - // Use the already-created iframe if possible - iframe = (iframe || jQuery( " + +
+

本特殊页面展示所有上传的文件。 +

+
文件列表
+
+
+
首页上一页下一页末页
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
日期名称缩⁠略⁠图大小用户描述
2012年2月4日 (六) 02:17Ramsay Bolton.jpg文件32 KBReasno 
2012年2月4日 (六) 05:26Sansa Stark by AniaEm.jpg文件30 KB泰温公爵 
2012年2月4日 (六) 05:26Sansa by akizhao.jpg文件50 KB泰温公爵 
2012年2月4日 (六) 06:33300px-Arya Stark by AniaEm.jpg文件26 KBThinkmac 
2012年2月4日 (六) 06:43FileHouse Stark.png文件4 KBThinkmac 
2012年2月4日 (六) 06:59Arya stark by daaria.jpg文件125 KBThinkmac 
2012年2月4日 (六) 07:1050px-House Reyne.png文件3 KBRubilacxe 
+首页上一页下一页末页 + + + + + + + + + +
+ + +
+
+ + + + + + + + + + +
+
+
+ + + + + + + \ No newline at end of file diff --git a/tests/data/html_regexs/commons.moegirl.org.cn-20230701.html b/tests/data/html_regexs/commons.moegirl.org.cn-20230701.html new file mode 100644 index 00000000..174f2130 --- /dev/null +++ b/tests/data/html_regexs/commons.moegirl.org.cn-20230701.html @@ -0,0 +1,250 @@ + + + + +文件列表 - 萌娘共享 + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ +

+

+
+

文件列表

+
+
+ 跳转至: 导航搜索 +
+
+

本特殊页面展示所有上传的文件。 +

+
+
+文件列表 + + +
+
+
+
  +
+ + + + + +
+
首页上一页下一页末页 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
日期名称缩略图尺寸用户说明版本
2010年10月16日 (六) 18:04Cat ico47.png文件887字节Baskice 1
2010年10月16日 (六) 18:04Cat ico20.png文件722字节Baskice 1
2010年10月16日 (六) 18:05Cat ico21.png文件790字节Baskice 1
2010年10月17日 (日) 12:41Bouncywikilogo.gif文件41 KBBaskice维基百科标志1
2010年10月28日 (四) 13:44Ambox content.png文件3 KBBaskice 1
2010年10月28日 (四) 13:578 cao ni ma.jpg文件32 KBBaskice{{不完整}} 分类:绿坝娘 分类:图片1
2010年10月28日 (四) 14:07Bath bathtub blush nopan panties ryuuzaki itsu striped panties thighhighs underwear.jpg文件996 KBBaskice{{缺乏信息}} {{模板:作者|未知|PIXIV}} 分类:绿坝娘 分类:图片1
+首页上一页下一页末页
+
+
+
+
+

导航菜单

+
+ +
+ + +
+
+ + + +
+
+ +
+ + + + diff --git a/tests/data/html_regexs/group0.mediawiki.demo.save-web.org_mediawiki-1.16.5-20230701.html b/tests/data/html_regexs/group0.mediawiki.demo.save-web.org_mediawiki-1.16.5-20230701.html new file mode 100644 index 00000000..f978527f --- /dev/null +++ b/tests/data/html_regexs/group0.mediawiki.demo.save-web.org_mediawiki-1.16.5-20230701.html @@ -0,0 +1,174 @@ + + + +文件列表 - 1165哈😂 + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +

文件列表

+
+

出自1165哈😂

+
+
跳转到: 导航, 搜索
+ +
+

这个特殊页面显示所有已上传文件。 +默认设置中,最后上传的文件会显示在这个列表的顶端。 +

+点击任一列标题可修改排序方式。
+
文件列表
+  + +
+
+ + + + + + + + + + +
降日期名称用户大小描述版本
2023年1月11日 (三) 19:05Screenshot_20230112025903.jpg (文件)Saveweb8 KB 1
2023年1月10日 (二) 14:31截图_2023-01-10_22-29-18.png (文件)Saveweb117 KB (hulahula)1
+
+ + +
+
+
+
+
+
查看
+
+ +
+
+
+
个人工具
+ +
+ + + + +
+
工具箱
+
+ +
+
+
+
+ +
+ + + diff --git a/tests/data/html_regexs/group0.mediawiki.demo.save-web.org_mediawiki-1.23.17-20230701.html b/tests/data/html_regexs/group0.mediawiki.demo.save-web.org_mediawiki-1.23.17-20230701.html new file mode 100644 index 00000000..5e09c711 --- /dev/null +++ b/tests/data/html_regexs/group0.mediawiki.demo.save-web.org_mediawiki-1.23.17-20230701.html @@ -0,0 +1,184 @@ + + + + +文件列表 - 12317 哈|「、」‘/-&^%$%@😒 + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + +

文件列表

+
+
+
+ 跳转至: 导航搜索 +
+
+

本特殊页面展示所有上传的文件。 +

+
+
+文件列表 +
每页项数:
按媒体名称搜索:
用户名:
包括图片的旧版本
+
+ + + + + + + + + + + + + + + + + + + +
降日期名称缩略图尺寸用户说明版本
2023年1月10日 (二) 22:26Tibet.png文件730 KBSavewebrszdggrsrergsrsgrsgz1
+
+
+
+
+
+
+

导航菜单

+
+ +
+ + +
+
+ + + +
+
+
+ + + +
+
+ + + + + + diff --git a/tests/data/html_regexs/group1.mediawiki.demo.save-web.org_mediawiki-1.27.7-20230701.html b/tests/data/html_regexs/group1.mediawiki.demo.save-web.org_mediawiki-1.27.7-20230701.html new file mode 100644 index 00000000..cd2ea12e --- /dev/null +++ b/tests/data/html_regexs/group1.mediawiki.demo.save-web.org_mediawiki-1.27.7-20230701.html @@ -0,0 +1,184 @@ + + + + +文件列表 - 12317 哈|「、」‘/-&^%$%@😒 + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + +

文件列表

+
+
+
+ 跳转至: 导航搜索 +
+
+

本特殊页面展示所有上传的文件。 +

+
+
+文件列表 +
每页项数:
按媒体名称搜索:
用户名:
包括图片的旧版本
+
+ + + + + + + + + + + + + + + + + + + +
降日期名称缩略图尺寸用户说明版本
2023年1月10日 (二) 22:26Tibet.png文件730 KBSavewebrszdggrsrergsrsgrsgz1
+
+
+
+
+
+
+

导航菜单

+
+ +
+ + +
+
+ + + +
+
+
+ + + +
+
+ + + + + + diff --git a/tests/data/html_regexs/group2.mediawiki.demo.save-web.org_mediawiki-1.39.1-20230701.html b/tests/data/html_regexs/group2.mediawiki.demo.save-web.org_mediawiki-1.39.1-20230701.html new file mode 100644 index 00000000..23a27bd0 --- /dev/null +++ b/tests/data/html_regexs/group2.mediawiki.demo.save-web.org_mediawiki-1.39.1-20230701.html @@ -0,0 +1,66 @@ + + + + +文件列表 - 1391 current ! Nice + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+

文件列表

来自1391 current ! Nice
+

本特殊页面展示所有上传的文件。 +

+
文件列表
+
+
+
+ + + + + + + + + + + + + + + + + + +
日期名称缩⁠略⁠图大小用户描述版本
2023年1月18日 (三) 21:24--*"?---------boom.png文件78 KBSavewebv22
+
+
+ + \ No newline at end of file diff --git a/tests/data/html_regexs/mediawiki.org-20230701.html b/tests/data/html_regexs/mediawiki.org-20230701.html new file mode 100644 index 00000000..3f2d479f --- /dev/null +++ b/tests/data/html_regexs/mediawiki.org-20230701.html @@ -0,0 +1,555 @@ + + + + +File list - MediaWiki + + + + + + + + + + + + + + + + + + + + + + + + + + +Jump to content +
+
+
+ + + + +
+
+ + + + + +
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+

File list

+ + +
+ +
+
+
+
+
+ +
+
+ + + +
+
+
+
+ +
+
+
+ + +
+
+ + +
+

This special page shows all uploaded files. +

+
File list
+
+
+
First pagePrevious pageNext pageLast page + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DateNameThumbnailSizeUserDescription
10:33, 2 December 2006Example.jpg (file)60 KBBdksmaller version
21:00, 24 February 2007BadImageList.png (file)51 KBRobchurchCrop whitespace from the right of the screenshot
10:36, 30 April 2007M-fi-recentchanges.png (file)130 KBCimon Avaroscreencapture for the PD-helpfiles
14:54, 14 May 2007NiceCategoryList2 extension example.jpg (file)28 KBJohanTheGhostExample image (partial screenshot) demonstrating the NiceCategoryList2 extension. Made by me, JohanTheGhost, 14 May, 2007.
15:56, 15 May 2007NiceCategoryList2 extension default.jpg (file)23 KBJohanTheGhost== Summary == Example image (partial screenshot) demonstrating the NiceCategoryList2 extension in its default mode. Made by me, JohanTheGhost, 15 May, 2007. == Licensing == {{GFDL}}
15:57, 15 May 2007NiceCategoryList2 extension full.jpg (file)20 KBJohanTheGhost== Summary == Example image (partial screenshot) demonstrating the NiceCategoryList2 extension used to generate a full listing. Made by me, JohanTheGhost, 15 May, 2007. == Licensing == {{GFDL}}
03:06, 24 June 2007TinyMCE MW.PNG (file)124 KBJoeSoxTinyMCE_MW extension Screenshots
+First pagePrevious pageNext pageLast page +
+ +
+
+ +
+ +
+
+
+ + + +
+ + + \ No newline at end of file diff --git a/tests/data/html_regexs/mediawiki.org-20240924-zh-rCN.html b/tests/data/html_regexs/mediawiki.org-20240924-zh-rCN.html new file mode 100644 index 00000000..25c50744 --- /dev/null +++ b/tests/data/html_regexs/mediawiki.org-20240924-zh-rCN.html @@ -0,0 +1,704 @@ + + + + +文件列表 - MediaWiki + + + + + + + + + + + + + + + + + + + + + + + + +跳转到内容 +
+
+
+ + + + +
+
+ + + + + +
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+

文件列表

+
+ +
+
+
+
+
+ +
+
+ + + +
+
+
+
+
+ + +
+
+
+
+ + +
+
+ + +
+

本特殊页面展示所有上传的文件。 +

+
文件列表
+
+
+
首页上一页下一页末页 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
日期名称缩⁠略⁠图大小用户描述
2006年12月2日 (六) 10:33Example.jpg文件60 KBBdk留言 | 贡献smaller version
2007年2月24日 (六) 21:00BadImageList.png文件51 KBRobchurch留言 | 贡献Crop whitespace from the right of the screenshot
2007年4月30日 (一) 10:36M-fi-recentchanges.png文件130 KBCimon Avaro留言 | 贡献screencapture for the PD-helpfiles
2007年5月14日 (一) 14:54NiceCategoryList2 extension example.jpg文件28 KBJohanTheGhost留言 | 贡献Example image (partial screenshot) demonstrating the NiceCategoryList2 extension. Made by me, JohanTheGhost, 14 May, 2007.
2007年5月15日 (二) 15:56NiceCategoryList2 extension default.jpg文件23 KBJohanTheGhost留言 | 贡献== Summary == Example image (partial screenshot) demonstrating the NiceCategoryList2 extension in its default mode. Made by me, JohanTheGhost, 15 May, 2007. == Licensing == {{GFDL}}
2007年5月15日 (二) 15:57NiceCategoryList2 extension full.jpg文件20 KBJohanTheGhost留言 | 贡献== Summary == Example image (partial screenshot) demonstrating the NiceCategoryList2 extension used to generate a full listing. Made by me, JohanTheGhost, 15 May, 2007. == Licensing == {{GFDL}}
2007年6月24日 (日) 03:06TinyMCE MW.PNG文件124 KBJoeSox留言 | 贡献TinyMCE_MW extension Screenshots
+首页上一页下一页末页 +
+ +
+
+ +
+ +
+
+
+
+
+ + + +
+ +
+
+ +
+
+
+
    +
    + +

    语言已自English更改

    \ No newline at end of file diff --git a/tests/data/html_regexs/mediawiki.org-20240924.html b/tests/data/html_regexs/mediawiki.org-20240924.html new file mode 100644 index 00000000..441f27b0 --- /dev/null +++ b/tests/data/html_regexs/mediawiki.org-20240924.html @@ -0,0 +1,612 @@ + + + + +File list - MediaWiki + + + + + + + + + + + + + + + + + + + + + + + + + +Jump to content +
    +
    +
    + + + + +
    +
    + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    File list

    +
    + +
    +
    +
    +
    +
    + +
    +
    + + + +
    +
    +
    +
    +
    + + +
    +
    +
    +
    + + +
    +
    + + +
    +

    This special page shows all uploaded files. +

    +
    File list
    +
    +
    +
    First pagePrevious pageNext pageLast page + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    DateNameThumbnailSizeUserDescription
    10:33, 2 December 2006Example.jpg (file)60 KBBdk (talk | contribs)smaller version
    21:00, 24 February 2007BadImageList.png (file)51 KBRobchurch (talk | contribs)Crop whitespace from the right of the screenshot
    10:36, 30 April 2007M-fi-recentchanges.png (file)130 KBCimon Avaro (talk | contribs)screencapture for the PD-helpfiles
    14:54, 14 May 2007NiceCategoryList2 extension example.jpg (file)28 KBJohanTheGhost (talk | contribs)Example image (partial screenshot) demonstrating the NiceCategoryList2 extension. Made by me, JohanTheGhost, 14 May, 2007.
    15:56, 15 May 2007NiceCategoryList2 extension default.jpg (file)23 KBJohanTheGhost (talk | contribs)== Summary == Example image (partial screenshot) demonstrating the NiceCategoryList2 extension in its default mode. Made by me, JohanTheGhost, 15 May, 2007. == Licensing == {{GFDL}}
    15:57, 15 May 2007NiceCategoryList2 extension full.jpg (file)20 KBJohanTheGhost (talk | contribs)== Summary == Example image (partial screenshot) demonstrating the NiceCategoryList2 extension used to generate a full listing. Made by me, JohanTheGhost, 15 May, 2007. == Licensing == {{GFDL}}
    03:06, 24 June 2007TinyMCE MW.PNG (file)124 KBJoeSox (talk | contribs)TinyMCE_MW extension Screenshots
    +First pagePrevious pageNext pageLast page +
    + +
    +
    + +
    + +
    +
    +
    +
      +
      + + \ No newline at end of file diff --git a/tests/data/html_regexs/wiki.othing.xyz-20230701.html b/tests/data/html_regexs/wiki.othing.xyz-20230701.html new file mode 100644 index 00000000..92a79d7f --- /dev/null +++ b/tests/data/html_regexs/wiki.othing.xyz-20230701.html @@ -0,0 +1,290 @@ + + + + +文件列表 - 互联网公墓 + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      + +
      +
      + +
      +

      文件列表

      +
      + +
      +
      + +
      + 跳到导航 + 跳到搜索 +
      +

      本特殊页面展示所有上传的文件。 +

      +
      文件列表
      +
      +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      日期名称缩⁠略⁠图尺⁠寸用户说明版本
      2021年8月10日 (二) 00:27Dvedit logo.gif文件3 KBYzqzss 1
      2021年8月10日 (二) 12:24Iamwu555.jpg文件51 KBYzqzss 1
      2021年8月10日 (二) 13:43Save the web.png文件11 KBYzqzss 1
      2021年8月19日 (四) 20:54Shoujileyuan close1.jpg文件35 KBYzqzss 1
      2021年8月19日 (四) 20:57Shoujileyuan close2.jpg文件31 KBYzqzss 1
      2021年8月19日 (四) 21:10Shoujileyuan qq group closed.jpg文件32 KBYzqzss 1
      2021年8月19日 (四) 21:29Shoujileyuan logo.png文件6 KBYzqzss 1
      2021年8月20日 (五) 01:49Wiki.png文件1 KBYzqzss本站logo1
      2021年9月3日 (五) 17:33晒书房 logo.png文件4 KBYzqzss 1
      + +
      + +
      +
      + +
      +

      导航菜单

      +
      + + + +
      + + + + + + +
      +
      + + + + + + + + +
      +
      + +
      + + + + + + + + +
      + +
      + + + + + \ No newline at end of file diff --git a/tests/html_regexs_test.py b/tests/html_regexs_test.py new file mode 100644 index 00000000..0379bb85 --- /dev/null +++ b/tests/html_regexs_test.py @@ -0,0 +1,106 @@ +import os +from pathlib import Path +import re +from urllib.parse import unquote +from typing import Dict + +import requests +import pytest + +from wikiteam3.dumpgenerator.dump.image.html_regexs import REGEX_CANDIDATES +from wikiteam3.utils.util import undo_HTML_entities + +ONLINE = False + +HTML_DIR = Path(__file__).parent / "data/html_regexs" +os.makedirs(HTML_DIR, exist_ok=True) + +def prepare_raws_from_urls(urls: Dict[str, str]): + sess = requests.Session() + raws: Dict[str, str] = {} + for site, url in urls.items(): + try: + resp = sess.get(url, timeout=10, allow_redirects=True) + except Exception as e: + pytest.warns(UserWarning, match=f"Could not fetch {url}: {e}") + continue + + if resp.status_code == 200: + raws[url] = resp.text + if not os.path.exists(HTML_DIR / f"{site}.html"): + with open(HTML_DIR / f"{site}.html", "w", encoding="utf-8") as f: + f.write(resp.text) + else: + pytest.warns(UserWarning, match=f"Could not fetch {url}: status_code: {resp.status_code}") + + return raws + +class TestRegexs: + class TestRegexsOnline: + listFiles_urls = { + # site-date: url , `limit=` for counting the number of matches + "group0.mediawiki.demo.save-web.org_mediawiki-1.16.5-20230701": "http://group0.mediawiki.demo.save-web.org/mediawiki-1.16.5/index.php?title=特殊:文件列表&limit=2", + "group2.mediawiki.demo.save-web.org_mediawiki-1.39.1-20230701": "http://group2.mediawiki.demo.save-web.org/mediawiki-1.39.1/index.php?title=Special:ListFiles&limit=1", + "archiveteam.org-20230701": "https://wiki.archiveteam.org/index.php?title=Special:ListFiles&sort=byname&limit=7", + "wiki.othing.xyz-20230701": "https://wiki.othing.xyz/index.php?title=Special:ListFiles&sort=byname", + "mediawiki.org-20230701": 
"https://www.mediawiki.org/w/index.php?title=Special:ListFiles&sort=byname&limit=7", + "asoiaf.fandom.com-20230701": "https://asoiaf.fandom.com/zh/wiki/Special:文件列表?sort=byname&limit=7", + "mediawiki.org-20240924": "https://www.mediawiki.org/w/index.php?title=Special:ListFiles&sort=byname&limit=7" + + # only for local testing: + # "commons.moegirl.org.cn-20230701": "https://commons.moegirl.org.cn/index.php?title=Special:ListFiles&sort=byname&limit=7", + # # login required: + # "group0.mediawiki.demo.save-web.org_mediawiki-1.23.17-20230701": "http://group0.mediawiki.demo.save-web.org/mediawiki-1.23.17/index.php?title=Special:文件列表&limit=1", + # "group1.mediawiki.demo.save-web.org_mediawiki-1.27.7-20230701": "http://group1.mediawiki.demo.save-web.org/mediawiki-1.27.7/index.php?title=Special:ListFiles&limit=2", + } + raws: Dict[str, str] = {} + def test_online(self): + if not ONLINE: + pytest.skip("Online test skipped") + self.raws = prepare_raws_from_urls(self.listFiles_urls) + assert len(self.raws) != 0, "Could not fetch any of the URLs" + for url, raw in self.raws.items(): + best_matched = 0 + regexp_best = None + + for index, regexp in enumerate(REGEX_CANDIDATES): + _count = len(re.findall(regexp, raw)) + if _count > best_matched: + best_matched = _count + regexp_best = regexp + + assert regexp_best is not None, f"Could not find a proper regexp to parse the HTML for {url} (online)" + + if "limit=" in url: + limit = int(url.split("limit=")[-1]) + assert len(re.findall(regexp_best, raw)) == limit, f"Could not find {limit} matches for {url} (online)" + + + class TestRegexsOffline: + html_files = os.listdir(HTML_DIR) + raws: Dict[str, str] = {} + for html_file in html_files: + with open(HTML_DIR / html_file, "r", encoding="utf-8") as f: + raws[html_file] = f.read() + assert len(raws) != 0, f"Could not find any HTML files in {HTML_DIR}" + raws_items = raws.items() + + @pytest.mark.parametrize('site, html_data', raws_items) + def test_offline(self, site, html_data): + 
best_matched = 0 + regexp_best = None + + for index, regexp in enumerate(REGEX_CANDIDATES): + _count = len(re.findall(regexp, html_data)) + if _count > best_matched: + best_matched = _count + regexp_best = regexp + + print("site", site, "best_matched", best_matched) + assert regexp_best is not None, f"Could not find a proper regexp to parse the HTML for {site} (local)" + + for i in re.compile(regexp_best).finditer(html_data): + url, filename, uploader = i.group("url"),\ + unquote(undo_HTML_entities(i.group("filename"))),\ + unquote(undo_HTML_entities(i.group("uploader"))) + print({"url": url, "filename": filename, "uploader": uploader}) diff --git a/tests/site_info_test.py b/tests/site_info_test.py new file mode 100644 index 00000000..d1c50dff --- /dev/null +++ b/tests/site_info_test.py @@ -0,0 +1,17 @@ +import json + +import pytest +import requests + +from wikiteam3.dumpgenerator.dump.misc.site_info import save_siteinfo + +from tests.test_config import get_config + +def test_mediawiki_1_16(): + pytest.skip("Temporarily down") + with get_config('1.16.5') as config: + sess = requests.Session() + save_siteinfo(config, sess) + with open(config.path + '/siteinfo.json', 'r') as f: + siteInfoJson = json.load(f) + assert siteInfoJson['query']['general']['generator'] == "MediaWiki 1.16.5" diff --git a/tests/test_bad_ssl.py b/tests/test_bad_ssl.py new file mode 100644 index 00000000..7c4f9b98 --- /dev/null +++ b/tests/test_bad_ssl.py @@ -0,0 +1,94 @@ + +import warnings + +import requests +import pytest +from urllib3.exceptions import InsecureRequestWarning + +from wikiteam3.utils.monkey_patch import WakeTLSAdapter + +def _get_session(): + session = requests.Session() + session.verify = False + requests.packages.urllib3.disable_warnings() # type: ignore + for protocol in ['http://', 'https://']: + session.mount(protocol, WakeTLSAdapter()) + return session + +session = None + +badssl_ok_urls = [ + "https://expired.badssl.com/", + "https://wrong.host.badssl.com/", + 
"https://self-signed.badssl.com/", + "https://untrusted-root.badssl.com/", + "https://revoked.badssl.com/", + "https://pinning-test.badssl.com/", + + "https://no-common-name.badssl.com", + "https://incomplete-chain.badssl.com", + "https://no-subject.badssl.com", + + "https://mozilla-old.badssl.com", + "https://null.badssl.com", + + "https://dh1024.badssl.com", + "https://dh2048.badssl.com", + + "https://dh-small-subgroup.badssl.com", + "https://dh-composite.badssl.com", + + "https://tls-v1-0.badssl.com:1010", + "https://tls-v1-1.badssl.com:1011", + "https://tls-v1-2.badssl.com:1012", + + "https://no-sct.badssl.com", + + "https://subdomain.preloaded-hsts.badssl.com", + "https://superfish.badssl.com", + "https://dsdtestprovider.badssl.com", + "https://preact-cli.badssl.com", + "https://webpack-dev-server.badssl.com", + + "https://captive-portal.badssl.com", + "https://mitm-software.badssl.com", + + "https://sha1-2016.badssl.com", + "https://sha1-2017.badssl.com", + "https://sha1-intermediate.badssl.com", + "https://invalid-expected-sct.badssl.com", + +] +@pytest.mark.parametrize("url", badssl_ok_urls) +def test_the_badssl_ok(url): + global session + session = session or _get_session() + resp = None + with warnings.catch_warnings(): + warnings.filterwarnings("ignore",category=InsecureRequestWarning) + try: + resp = session.get(url, timeout=20) + except Exception as e: + pytest.fail(f"Could not fetch {url}: {e}") + + assert resp is not None, f"Could not fetch {url}" + +badssl_may_fail_urls = [ + "https://rc4-md5.badssl.com", + "https://rc4.badssl.com", + "https://3des.badssl.com", + + "https://dh480.badssl.com", + "https://dh512.badssl.com", +] +@pytest.mark.parametrize("url", badssl_may_fail_urls) +def test_the_badssl_may_fail(url): + global session + session = session or _get_session() + resp = None + with warnings.catch_warnings(): + warnings.filterwarnings("ignore",category=InsecureRequestWarning) + try: + resp = session.get(url, timeout=20) + except Exception as 
e: + pytest.skip("This test is expected to fail on default OpenSSL configuration") diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 00000000..5e6c9f55 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,30 @@ +import copy +import tempfile +from contextlib import contextmanager + +from wikiteam3.dumpgenerator.cli import get_parameters +from wikiteam3.dumpgenerator.config import new_config + +CONFIG_CACHE = {} + +@contextmanager +def _new_config_from_parameter(params): + _params = tuple(params) + if _params in CONFIG_CACHE: + return CONFIG_CACHE[_params] + config, _ = get_parameters(['--path=.', '--xml'] + list(params)) + CONFIG_CACHE[_params] = config + _config = new_config(copy.deepcopy(config.asdict())) + try: + with tempfile.TemporaryDirectory(prefix='wikiteam3test_') as tmpdir: + _config.path = tmpdir + yield _config + finally: + pass + +def get_config(mediawiki_ver, api=True): + assert api is True + if mediawiki_ver == '1.16.5': + return _new_config_from_parameter([ + "--api", "http://group0.mediawiki.demo.save-web.org/mediawiki-1.16.5/api.php", + ]) diff --git a/tests/test_image.py b/tests/test_image.py new file mode 100644 index 00000000..dbcffb97 --- /dev/null +++ b/tests/test_image.py @@ -0,0 +1,36 @@ +import pytest + +from wikiteam3.dumpgenerator.dump.image.image import Image + +# https://github.com/saveweb/wikiteam3/issues/51 + +def test_get_image_names_API_spaces2underscore(monkeypatch): + class DummyConfig: + api = 'https://example.com/api.php' + api_chunksize = 10 + index = 'https://example.com/index.php' + class DummySession: + def get(self, url, params=None, timeout=None): + class DummyResponse: + def __init__(self): + self.status_code = 200 + self.headers = {} + def json(self): + return { + 'query': { + 'allimages': [ + {'name': 'Video:Fatal 5 combo video', 'url': 'https://example.com/1', 'user': 'User A'}, + {'name': 'Test_image_with_underscore', 'url': 'https://example.com/2', 'user': 'User B'}, + ] + } + } + 
return DummyResponse() + # Patch handle_StatusCode/get_JSON/Delay to be no-ops + monkeypatch.setattr('wikiteam3.dumpgenerator.dump.image.image.handle_StatusCode', lambda r: None) + monkeypatch.setattr('wikiteam3.dumpgenerator.dump.image.image.get_JSON', lambda r: r.json()) + monkeypatch.setattr('wikiteam3.dumpgenerator.dump.image.image.Delay', lambda config: None) + result = Image.get_image_names_API(DummyConfig(), DummySession()) + # The result is a array, filename is at index 0 + filenames = [row[0] for row in result] + assert all(' ' not in fn for fn in filenames) + assert 'Video:Fatal_5_combo_video' in filenames \ No newline at end of file diff --git a/tests/test_image_fandom_png_skip.py b/tests/test_image_fandom_png_skip.py new file mode 100644 index 00000000..4db60420 --- /dev/null +++ b/tests/test_image_fandom_png_skip.py @@ -0,0 +1,52 @@ +# Test for Fandom PNG skip logic in images_mismatch when resuming +import pytest +from unittest.mock import MagicMock, patch +from pathlib import Path +import builtins + +from wikiteam3.dumpgenerator.dump.image.image import Image + +class DummyConfig: + api = "https://spongebob.fandom.com/api.php" + path = "/tmp/testdump" + +class DummyOtherConfig: + image_timestamp_interval = None + ia_wbm_booster = None + hard_retries = 0 + +@pytest.fixture +def fandom_png_in_mismatch(tmp_path): + # Setup test directory and file + images_mismatch = tmp_path / "images_mismatch" + images_mismatch.mkdir() + png_name = "Test_Image.png" + (images_mismatch / png_name).write_bytes(b"fakepngdata") + return tmp_path, png_name + +def test_fandom_png_skip(monkeypatch, fandom_png_in_mismatch, capsys): + tmp_path, png_name = fandom_png_in_mismatch + config = DummyConfig() + config.path = str(tmp_path) + other = DummyOtherConfig() + # Fandom PNG in images_mismatch + images = [[png_name, "https://static.wikia.nocookie.net/abcdefg/images/1/11/Test_Image.png?cb=20210101010101", "Uploader", "123", "fake_sha1", "2021-01-01T01:01:01Z"]] + + # Patch 
Path.is_file to return True only for the images_mismatch PNG + orig_is_file = Path.is_file + def is_file_patch(self): + if str(self) == str(tmp_path / "images_mismatch" / png_name): + return True + return orig_is_file(self) + monkeypatch.setattr(Path, "is_file", is_file_patch) + + # Patch print to capture output + with patch.object(builtins, "print") as mock_print: + # Should skip and not attempt download + Image.generate_image_dump(config, other, images, MagicMock()) + # Check that skip message was printed + found = False + for call in mock_print.call_args_list: + if f"Skipping Fandom PNG/JPG (already in images_mismatch): {png_name}" in str(call): + found = True + assert found, "Did not skip Fandom PNG in images_mismatch as expected" diff --git a/tests/test_page_titiles.py b/tests/test_page_titiles.py new file mode 100644 index 00000000..2ff2a136 --- /dev/null +++ b/tests/test_page_titiles.py @@ -0,0 +1,22 @@ +from wikiteam3.dumpgenerator.api.page_titles import read_until_end + +def test_read_until_end(): + data = [ + "a", + "b", + "c", + "d", + "e", + ] + data_with_end = [i+"\n" for i in data] + data_with_end.append("--END--") + assert list(read_until_end(data_with_end)) == data + assert list(read_until_end(data_with_end, start="c")) == ["c", "d", "e"] + assert list(read_until_end(data_with_end, start="x")) == [] + assert list(read_until_end(data_with_end + ["--END--\n"])) == data + ["--END--"] # two end markers + + try: + _ = list(read_until_end([])) == [] + assert False, "Should raise EOFError" + except EOFError: + pass \ No newline at end of file diff --git a/tests/test_wiki_avoid.py b/tests/test_wiki_avoid.py new file mode 100644 index 00000000..cbbb1bb4 --- /dev/null +++ b/tests/test_wiki_avoid.py @@ -0,0 +1,117 @@ +import unittest +from unittest.mock import patch, MagicMock + +import requests +from wikiteam3.dumpgenerator.config import Config +from wikiteam3.utils.wiki_avoid import avoid_robots_disallow + + +class TestAvoidRobotsDisallow(unittest.TestCase): 
+ + @patch('wikiteam3.utils.wiki_avoid.sys.exit') + @patch('wikiteam3.utils.wiki_avoid.requests.get') + @patch('wikiteam3.utils.wiki_avoid.urllib.robotparser.RobotFileParser') + def test_avoid_robots_disallow_allowed(self, mock_robotparser, mock_requests_get, mock_sys_exit): + """Test when robots.txt allows the user agent""" + config = Config() + config.api = "http://example.com/w/api.php" + other = MagicMock() + other.session = requests.Session() + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = "User-agent: *\nAllow: /" + mock_requests_get.return_value = mock_response + + mock_bot = MagicMock() + mock_bot.can_fetch.return_value = True + mock_robotparser.return_value = mock_bot + + avoid_robots_disallow(config, other) + + mock_requests_get.assert_called_once() + mock_bot.parse.assert_called_once() + self.assertEqual(mock_sys_exit.call_count, 0) + + @patch('wikiteam3.utils.wiki_avoid.sys.exit') + @patch('wikiteam3.utils.wiki_avoid.requests.get') + @patch('wikiteam3.utils.wiki_avoid.urllib.robotparser.RobotFileParser') + def test_avoid_robots_disallow_disallowed(self, mock_robotparser, mock_requests_get, mock_sys_exit): + """Test when robots.txt disallows the user agent""" + config = Config() + config.api = "http://example.com/w/api.php" + other = MagicMock() + other.session = requests.Session() + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = "User-agent: wikiteam3\nDisallow: /" + mock_requests_get.return_value = mock_response + + mock_bot = MagicMock() + mock_bot.can_fetch.return_value = False + mock_robotparser.return_value = mock_bot + + avoid_robots_disallow(config, other) + + mock_requests_get.assert_called_once() + mock_bot.parse.assert_called_once() + mock_sys_exit.assert_called_once_with(20) + + @patch('wikiteam3.utils.wiki_avoid.sys.exit') + @patch('wikiteam3.utils.wiki_avoid.requests.get') + @patch('wikiteam3.utils.wiki_avoid.urllib.robotparser.RobotFileParser') + def 
test_avoid_robots_disallow_error(self, mock_robotparser, mock_requests_get, mock_sys_exit): + """Test when there is an error fetching robots.txt""" + config = Config() + config.api = "http://example.com/w/api.php" + other = MagicMock() + other.session = requests.Session() + + mock_requests_get.side_effect = Exception("Test exception") + + avoid_robots_disallow(config, other) + + mock_requests_get.assert_called_once() + self.assertEqual(mock_robotparser.call_count, 1) + self.assertEqual(mock_sys_exit.call_count, 0) + + @patch('wikiteam3.utils.wiki_avoid.sys.exit') + @patch('wikiteam3.utils.wiki_avoid.requests.get') + @patch('wikiteam3.utils.wiki_avoid.urllib.robotparser.RobotFileParser') + def test_avoid_robots_disallow_robots_not_found(self, mock_robotparser, mock_requests_get, mock_sys_exit): + """Test when robots.txt returns a 404""" + config = Config() + config.api = "http://example.com/w/api.php" + other = MagicMock() + other.session = requests.Session() + + mock_response = MagicMock() + mock_response.status_code = 404 + mock_requests_get.return_value = mock_response + + avoid_robots_disallow(config, other) + + mock_requests_get.assert_called_once() + self.assertEqual(mock_robotparser.call_count, 1) + self.assertEqual(mock_sys_exit.call_count, 0) + + @patch('wikiteam3.utils.wiki_avoid.sys.exit') + @patch('wikiteam3.utils.wiki_avoid.requests.get') + @patch('wikiteam3.utils.wiki_avoid.urllib.robotparser.RobotFileParser') + def test_avoid_robots_disallow_no_api_index(self, mock_robotparser, mock_requests_get, mock_sys_exit): + """Test when both config.api and config.index are None""" + config = Config() + config.api = None + config.index = None + other = MagicMock() + other.session = requests.Session() + + avoid_robots_disallow(config, other) + + self.assertEqual(mock_requests_get.call_count, 0) + self.assertEqual(mock_robotparser.call_count, 1) + self.assertEqual(mock_sys_exit.call_count, 0) + +if __name__ == '__main__': + unittest.main() \ No newline at end of 
file diff --git a/batchdownload/7z2bz2.sh b/tools/batchdownload/7z2bz2.sh similarity index 97% rename from batchdownload/7z2bz2.sh rename to tools/batchdownload/7z2bz2.sh index 72a752db..093e262f 100644 --- a/batchdownload/7z2bz2.sh +++ b/tools/batchdownload/7z2bz2.sh @@ -8,4 +8,4 @@ DUMP=$(echo $1 | sed 's/.7z//') echo $DUMP 7z e -so $DUMP.7z $DUMP | bzip2 -c > $DUMP.bz2; -rm $1; \ No newline at end of file +rm $1; diff --git a/batchdownload/editthis/editthis.info b/tools/batchdownload/editthis/editthis.info similarity index 100% rename from batchdownload/editthis/editthis.info rename to tools/batchdownload/editthis/editthis.info diff --git a/batchdownload/editthis/editthis.info.all b/tools/batchdownload/editthis/editthis.info.all similarity index 100% rename from batchdownload/editthis/editthis.info.all rename to tools/batchdownload/editthis/editthis.info.all diff --git a/batchdownload/referata/referata.txt b/tools/batchdownload/referata/referata.txt similarity index 100% rename from batchdownload/referata/referata.txt rename to tools/batchdownload/referata/referata.txt diff --git a/batchdownload/referata/referata.txt.all b/tools/batchdownload/referata/referata.txt.all similarity index 100% rename from batchdownload/referata/referata.txt.all rename to tools/batchdownload/referata/referata.txt.all diff --git a/batchdownload/taskforce/mediawikis_done_2014.txt b/tools/batchdownload/taskforce/mediawikis_done_2014.txt similarity index 100% rename from batchdownload/taskforce/mediawikis_done_2014.txt rename to tools/batchdownload/taskforce/mediawikis_done_2014.txt diff --git a/batchdownload/taskforce/mediawikis_notarchived_2018.txt b/tools/batchdownload/taskforce/mediawikis_notarchived_2018.txt similarity index 100% rename from batchdownload/taskforce/mediawikis_notarchived_2018.txt rename to tools/batchdownload/taskforce/mediawikis_notarchived_2018.txt diff --git a/batchdownload/taskforce/mediawikis_pavlo.alive.filtered.todo.txt 
b/tools/batchdownload/taskforce/mediawikis_pavlo.alive.filtered.todo.txt similarity index 100% rename from batchdownload/taskforce/mediawikis_pavlo.alive.filtered.todo.txt rename to tools/batchdownload/taskforce/mediawikis_pavlo.alive.filtered.todo.txt diff --git a/batchdownload/wikkii/wikkii.txt b/tools/batchdownload/wikkii/wikkii.txt similarity index 100% rename from batchdownload/wikkii/wikkii.txt rename to tools/batchdownload/wikkii/wikkii.txt diff --git a/batchdownload/wikkii/wikkii.txt.all b/tools/batchdownload/wikkii/wikkii.txt.all similarity index 100% rename from batchdownload/wikkii/wikkii.txt.all rename to tools/batchdownload/wikkii/wikkii.txt.all diff --git a/tools/images_size.py b/tools/images_size.py new file mode 100644 index 00000000..933514d1 --- /dev/null +++ b/tools/images_size.py @@ -0,0 +1,19 @@ +import argparse + +def parse_args(): + parser = argparse.ArgumentParser(description='Estimate size of images') + parser.add_argument('images_list', help='Path to images.txt') + args = parser.parse_args() + return args + +if __name__ == '__main__': + arg = parse_args() + images_list = arg.images_list + size = 0 + with open(images_list, 'r', encoding='utf-8') as f: + for line in f: + try: + size += int(line.strip().split('\t')[3]) + except Exception: + pass + print(size // 1024 // 1024 // 1024, 'GB') \ No newline at end of file diff --git a/tools/not-archived.py b/tools/not-archived.py new file mode 100644 index 00000000..894a9609 --- /dev/null +++ b/tools/not-archived.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# not-archived.py List of not archived wikis, using WikiApiary data +# NOTE: unreliable! https://github.com/WikiApiary/WikiApiary/issues/130 +# +# Copyright (C) 2015 WikiTeam developers +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import re +import ssl +import urllib.request + + +def getdomain(wiki): + domain = wiki + domain = domain.split("://")[1].split("/")[0] + domain = re.sub(r"(?im)^www\d*\.", "", domain) + return domain + + +def main(): + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS or ssl.VERIFY_X509_TRUSTED_FIRST) + + doneurl = "https://archive.org/advancedsearch.php?q=collection%3A%28wikiteam%29+AND+originalurl%3A[%22http%22+TO+null]&fl[]=description&sort[]=&sort[]=&sort[]=&rows=100000&page=1&output=json&callback=callback" + f = urllib.request.urlopen(doneurl, context=ssl_context) + wikiurls = re.findall( + r'(?im)]+?)\\" rel=\\"nofollow\\">[^<]+? dumped with', + str(f.read()), + ) + donewikis = [getdomain(wikiurl) for wikiurl in wikiurls] + # print 'Loaded %d done wikis' % len(donewikis) + + offset = 0 + limit = 500 + wikis = [] + while True: + # query does not retrieve wikifarms wikis, fix it? 
https://wikiapiary.com/wiki/Reiser4_FS_Wiki + url = ( + "https://wikiapiary.com/wiki/Special:Ask/-5B-5BCategory:Website-20not-20archived-5D-5D-20-5B-5BIs-20defunct::False-5D-5D-20-5B-5BIs-20in-20farm::False-5D-5D/-3F%%3DWiki-23/-3FHas-20API-20URL%%3DAPI/-3FHas-20pages-20count%%3DPages/-3FHas-20images-20count%%3DImages/format%%3Dtable/limit%%3D%d/link%%3Dall/sort%%3DHas-20pages-20count,Has-20images-20count/order%%3Dasc/mainlabel%%3DWiki/searchlabel%%3D%%E2%%80%%A6-20further-20results/offset%%3D%d" + % (limit, offset) + ) + f = urllib.request.urlopen(url, context=ssl_context) + raw = f.read() + m = re.findall( + '(?im)([^<>]+?)[^<>]+?[^<>]+?[^<>]+?[^<>]+?', + str(raw), + ) + for i in m: + domain = getdomain(i[1]) + if ( + domain not in donewikis + and not domain.endswith("editthis.info") + and not domain.endswith("wiki-site.com") + ): + print(i[1], i[2], i[3], i[0]) + + if not re.search(r'rel="nofollow">Next', str(raw)): + break + offset += limit + + +if __name__ == "__main__": + main() diff --git a/tools/wikiadownloader.py b/tools/wikiadownloader.py new file mode 100644 index 00000000..164ce0ba --- /dev/null +++ b/tools/wikiadownloader.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2011 WikiTeam +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +# using a list of wikia subdomains, it downloads all dumps available in Special:Statistics pages +# you can use the list available at the "listofwikis" directory, the file is called wikia.com and it contains +200k wikis + +""" +instructions: + +it requires a list of wikia wikis +there is one in the repository (listofwikis directory) + +run it: python wikiadownloader.py + +it you want to resume: python wikiadownloader.py wikitostartfrom + +where wikitostartfrom is the last downloaded wiki in the previous session + +""" +import os +import re +import ssl +import sys +import urllib.request +from urllib.error import HTTPError + + +def download(wiki): + f = urllib.request.urlopen( + "%s/wiki/Special:Statistics" % (wiki), context=ssl_context + ) + html = str(f.read()) + f.close() + + m = re.compile( + r'(?i)(?P\d{4})-(?P\d{2})-(?P\d{2}) (?P', + raw2_, + )[0] + y = re.findall(r"(?im)(\d+)", raw2_)[0] + itemfiles.append([int(x), int(y)]) + except: + pass + + itemfiles.sort(reverse=True) + print(itemfiles) + itemdate = ( + str(itemfiles[0][0])[0:4] + + "/" + + str(itemfiles[0][0])[4:6] + + "/" + + str(itemfiles[0][0])[6:8] + ) + itemsize = itemfiles[0][1] + + iaparams = """|Internet Archive identifier={} +|Internet Archive URL={} +|Internet Archive added date={} 00:00:00 +|Internet Archive file size={}""".format( + itemidentifier, + itemurl, + itemdate, + itemsize, + ) + newtext = page.text + newtext = re.sub(r"(?im)\n\}\}", "\n%s\n}}" % (iaparams), newtext) + + if page.text != newtext: + pywikibot.showDiff(page.text, newtext) + page.text = newtext + page.save( + "BOT - Adding dump details: %s, %s, %s bytes" + % (itemidentifier, itemdate, itemsize), + botflag=True, + ) + + +if __name__ == "__main__": + main() diff --git a/wikiapiary/wikiapiary_family.py b/tools/wikiapiary/wikiapiary_family.py similarity index 78% rename from wikiapiary/wikiapiary_family.py rename to tools/wikiapiary/wikiapiary_family.py index daf381f6..85191828 100644 --- a/wikiapiary/wikiapiary_family.py 
+++ b/tools/wikiapiary/wikiapiary_family.py @@ -1,11 +1,10 @@ -# -*- coding: utf-8 -*- """Family module for WikiApiary wiki.""" -from __future__ import unicode_literals -__version__ = '$Id: 8c9856dd7c0af8d400d0d95b00bf406002729008 $' +__version__ = "$Id: 8c9856dd7c0af8d400d0d95b00bf406002729008 $" from pywikibot import family + # The MediaWiki family # user-config.py: usernames['wikiapiary']['wikiapiary'] = 'User name' class Family(family.WikimediaFamily): @@ -14,11 +13,11 @@ class Family(family.WikimediaFamily): def __init__(self): """Constructor.""" - super(Family, self).__init__() - self.name = 'wikiapiary' + super().__init__() + self.name = "wikiapiary" self.langs = { - 'wikiapiary': 'wikiapiary.com', + "wikiapiary": "wikiapiary.com", } # Wikimedia wikis all use "bodyContent" as the id of the
      @@ -38,24 +37,24 @@ def scriptpath(self, code): uses a different value. """ - return '/w' + return "/w" # Which version of MediaWiki is used? REQUIRED def version(self, code): # Replace with the actual version being run on your wiki - return '1.25.3' + return "1.25.3" def code2encoding(self, code): """Return the encoding for a specific language wiki""" # Most wikis nowadays use UTF-8, but change this if yours uses # a different encoding - return 'utf-8' - + return "utf-8" + def path(self, code): - return '/w/index.php' + return "/w/index.php" def apipath(self, code): - return '/w/api.php' + return "/w/api.php" def protocol(self, code): - return 'HTTPS' + return "HTTPS" diff --git a/tools/wikimediacommons/commons-update-status.py b/tools/wikimediacommons/commons-update-status.py new file mode 100644 index 00000000..80976da5 --- /dev/null +++ b/tools/wikimediacommons/commons-update-status.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2012-2016 WikiTeam developers +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import json +import urllib + + +def main(): + queryurl = "https://archive.org/advancedsearch.php?q=collection%3Awikimediacommons&fl[]=identifier&sort[]=&sort[]=&sort[]=&rows=1000&page=1&output=json&callback=callback" + raw = urllib.request.urlopen(queryurl).read() + raw = raw.split("callback(")[1].strip(")") + result = json.loads(raw)["response"]["docs"] + + identifiers = {} + for item in result: + identifier = item["identifier"] + if "wikimediacommons-20" in identifier: + date = identifier.split("wikimediacommons-")[1] + t = date.split("-") + if len(t) == 1: + if len(t[0]) == 4: # YYYY + identifiers[t[0]] = identifier + elif len(t[0]) == 6: # YYYYMM + identifiers[f"{t[0][:4]}-{t[0][4:6]}"] = identifier + elif len(t[0]) == 8: # YYYYMMDD + identifiers[f"{t[0][:4]}-{t[0][4:6]}-{t[0][6:8]}"] = identifier + else: + print("ERROR, dont understand date format in %s" % (identifier)) + elif len(t) == 2: + if len(t[0]) == 4 and len(t[1]) == 2: # YYYY-MM + identifiers[f"{t[0]}-{t[1]}"] = identifier + else: + print("ERROR, dont understand date format in %s" % (identifier)) + elif len(t) == 3: + if len(t[0]) == 4 and len(t[1]) == 2 and len(t[2]) == 2: # YYYY-MM-DD + identifiers[f"{t[0]}-{t[1]}-{t[2]}"] = identifier + else: + print("ERROR, dont understand date format in %s" % (identifier)) + + identifiers_list = [[k, v] for k, v in identifiers.items()] + identifiers_list.sort() + + rows = [ + f"|-\n| {k} || [https://archive.org/details/{v} {v}] || ??? || ???" + for k, v in identifiers_list + ] + output = """ +{| class="wikitable sortable" +! Date !! Identifier !! Files !! 
Size (GB) +%s +|}""" % ( + "\n".join(rows) + ) + print(output) + + +if __name__ == "__main__": + main() diff --git a/tools/wikimediacommons/commonschecker.py b/tools/wikimediacommons/commonschecker.py new file mode 100644 index 00000000..18f48332 --- /dev/null +++ b/tools/wikimediacommons/commonschecker.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# Copyright (C) 2011-2012 WikiTeam +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import csv +import datetime +import os +import re +import sys +import zipfile +from hashlib import md5 + + +def welcome(): + """""" + print("#" * 73) + print("# Welcome to CommonsChecker 0.1 by WikiTeam (GPL v3) #") + print("# More info at: http://code.google.com/p/wikiteam/ #") + print("#" * 73) + print("") + print("#" * 73) + print("# Copyright (C) 2011-2012 WikiTeam #") + print("# This program is free software: you can redistribute it and/or modify #") + print("# it under the terms of the GNU General Public License as published by #") + print("# the Free Software Foundation, either version 3 of the License, or #") + print("# (at your option) any later version. #") + print("# #") + print("# This program is distributed in the hope that it will be useful, #") + print("# but WITHOUT ANY WARRANTY; without even the implied warranty of #") + print("# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #") + print("# GNU General Public License for more details. 
#") + print("# #") + print("# You should have received a copy of the GNU General Public License #") + print("# along with this program. If not, see . #") + print("#" * 73) + print("") + + +def main(): + welcome() + + startdate = "" + enddate = "" + delta = datetime.timedelta(days=1) # chunks by day + if len(sys.argv) == 1: + print( + "Usage example: python script.py 2005-01-01 2005-01-10 [to check the first 10 days of 2005]" + ) + sys.exit() + elif len(sys.argv) == 2: # use sys.argv[1] as start and enddata, just check a day + startdate = datetime.datetime.strptime(sys.argv[1], "%Y-%m-%d") + enddate = datetime.datetime.strptime(sys.argv[1], "%Y-%m-%d") + elif len(sys.argv) == 3: + startdate = datetime.datetime.strptime(sys.argv[1], "%Y-%m-%d") + enddate = datetime.datetime.strptime(sys.argv[2], "%Y-%m-%d") + else: + sys.exit() + + print( + "Checking Wikimedia Commons files from %s to %s" + % (startdate.strftime("%Y-%m-%d"), enddate.strftime("%Y-%m-%d")) + ) + while startdate <= enddate: + print("== %s ==" % (startdate.strftime("%Y-%m-%d"))) + filenamecsv = startdate.strftime("%Y-%m-%d.csv") + filenamezip = startdate.strftime("%Y-%m-%d.zip") + if os.path.exists(filenamecsv): + f = csv.reader( + open(filenamecsv), + delimiter="|", + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + ) + if os.path.exists(filenamezip): + zipfiles = zipfile.ZipFile(filenamezip, "r").infolist() + errors = [] + files_in_zip = [] + csv_data_dict = {} + csv_file_list = [] + files = {} + for ( + img_name, + img_saved_as, + img_timestamp, + img_user, + img_user_text, + img_size, + img_width, + img_height, + ) in f: + csv_data_dict[ + str( + "{}/{}".format( + startdate.strftime("%Y/%m/%d"), img_saved_as + ), + "utf-8", + ) + ] = { + "img_name": img_name, + "img_saved_as": img_saved_as, + "img_timestamp": img_timestamp, + "img_user": img_user, + "img_user_text": img_user_text, + "img_size": img_size, + "img_width": img_width, + "img_height": img_height, + } + csv_file_list.append( + str( + 
"{}/{}".format( + startdate.strftime("%Y/%m/%d"), img_saved_as + ), + "utf-8", + ) + ) + for i in zipfiles: + files_in_zip.append(i.filename) + files[i.filename] = i + combined = list(set(files_in_zip) & set(csv_file_list)) + for name in set(combined): + csv_img = csv_data_dict[name] + if csv_img["img_timestamp"].startswith( + startdate.strftime("%Y%m%d") + ): + # check img_saved_as existence in zip and check size + # img_saved_as = unicode(img_saved_as, 'utf-8') + ok = False + error = "missing" + i = files[name] + if str(i.file_size) == csv_img["img_size"]: + ok = True + elif i.file_size == 0: + error = "empty" + else: + error = "corrupt ({} of {} bytes)".format( + i.file_size, + csv_img["img_size"], + ) + if not ok: + print(csv_img["img_name"], csv_img["img_saved_as"], error) + errors.append([csv_img["img_saved_as"], error]) + if errors: + print("This .zip contains errors:") + print( + "\n".join( + [ + f' -> "{filename}" is {error}' + for filename, error in errors + ] + ) + ) + else: + print("No errors found") + else: + print("Error, no %s available" % (filenamezip)) + startdate += delta + + +if __name__ == "__main__": + main() diff --git a/tools/wikimediacommons/commonsdownloader.py b/tools/wikimediacommons/commonsdownloader.py new file mode 100644 index 00000000..7664249c --- /dev/null +++ b/tools/wikimediacommons/commonsdownloader.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2011-2016 WikiTeam developers +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import csv +import datetime +import os +import re +import sys +import urllib +from hashlib import md5 + + +def welcome(): + """""" + print("#" * 73) + print("# Welcome to CommonsDownloader 0.1 by WikiTeam (GPL v3)") + print("More info: https://github.com/WikiTeam/wikiteam") + print("#" * 73) + print("") + print("#" * 73) + print("# Copyright (C) 2011-2016 WikiTeam") + print("# This program is free software: you can redistribute it and/or modify #") + print("# it under the terms of the GNU General Public License as published by #") + print("# the Free Software Foundation, either version 3 of the License, or #") + print("# (at your option) any later version. #") + print("# #") + print("# This program is distributed in the hope that it will be useful, #") + print("# but WITHOUT ANY WARRANTY; without even the implied warranty of #") + print("# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #") + print("# GNU General Public License for more details. #") + print("# #") + print("# You should have received a copy of the GNU General Public License #") + print("# along with this program. If not, see . #") + print("#" * 73) + print("") + + +def bye(): + """""" + print("---> Congratulations! Your dump is complete <---") + print( + "If you found any bug, report a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list" + ) + print( + "If this is a public wiki, please, consider sending us a copy of this dump. Contact us at http://code.google.com/p/wikiteam" + ) + print("Good luck! Bye!") + + +def main(): + welcome() + + filenamefeed = "commonssql.csv" # feed + startdate = "" + enddate = "" + delta = datetime.timedelta(days=1) # chunks by day + filenamelimit = 100 # do not change!!! 
+ if len(sys.argv) == 1: + print( + "Usage example: python script.py 2005-01-01 2005-01-10 [to download the first 10 days of 2005]" + ) + sys.exit() + elif ( + len(sys.argv) == 2 + ): # use sys.argv[1] as start and enddata, just download a day + startdate = datetime.datetime.strptime(sys.argv[1], "%Y-%m-%d") + enddate = datetime.datetime.strptime(sys.argv[1], "%Y-%m-%d") + elif len(sys.argv) == 3: + startdate = datetime.datetime.strptime(sys.argv[1], "%Y-%m-%d") + enddate = datetime.datetime.strptime(sys.argv[2], "%Y-%m-%d") + else: + sys.exit() + + print( + "Downloading Wikimedia Commons files from %s to %s" + % (startdate.strftime("%Y-%m-%d"), enddate.strftime("%Y-%m-%d")) + ) + while startdate <= enddate: + print("== %s ==" % (startdate.strftime("%Y-%m-%d"))) + savepath = startdate.strftime("%Y/%m/%d") + filenamecsv = startdate.strftime("%Y-%m-%d.csv") + filenamezip = startdate.strftime("%Y-%m-%d.zip") + c = 0 + f = csv.reader( + open(filenamefeed), + delimiter="|", + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + ) + for ( + img_name, + img_timestamp, + img_user, + img_user_text, + img_size, + img_width, + img_height, + ) in f: + if img_timestamp.startswith(startdate.strftime("%Y%m%d")): + if not c: # first loop + try: # create savepath if not exists + os.makedirs(savepath) + except: + pass + # csv header + h = open(filenamecsv, "w") + h.write( + "img_name|img_saved_as|img_timestamp|img_user|img_user_text|img_size|img_width|img_height\n" + ) + h.close() + + img_name = str(img_name, "utf-8") + img_user_text = str(img_user_text, "utf-8") + original_name = img_name + if re.search( + r"(?m)^\d{14}\!", original_name + ): # removing 20101005024534! 
(or similar) from name if present + original_name = original_name[15:] + # quote weird chars to avoid errors while wgetting + img_name_quoted = urllib.parse.quote(re.sub(r" ", r"_", str(img_name))) + # _ ending variables contains no spaces, and \" for command line + img_name_ = re.sub( + r'"', r"\"", re.sub(r" ", r"_", str(img_name)) + ) # do not use r'', it is encoded + original_name_ = re.sub( + r'"', r"\"", re.sub(r" ", r"_", str(original_name)) + ) # do not use r'', it is encoded + md5hash = md5( + re.sub(" ", "_", original_name.encode("utf-8")) + ).hexdigest() # do not use image_name, md5 needs the original name and without \" + img_saved_as = "" + img_saved_as_ = "" + if len(img_name) > filenamelimit: # truncate filename if it is long + img_saved_as = ( + img_name[:filenamelimit] + + md5(re.sub(" ", "_", str(img_name))).hexdigest() + + "." + + img_name.split(".")[-1] + ) + img_saved_as = re.sub( + r" ", r"_", img_saved_as + ) # do not use r'', it is encoded + img_saved_as_ = re.sub( + r'"', r"\"", re.sub(r" ", r"_", img_saved_as.encode("utf-8")) + ) # do not use r'', it is encoded + else: + img_saved_as = re.sub( + r" ", r"_", img_name + ) # do not use r'', it is encoded + img_saved_as_ = re.sub( + r'"', r"\"", re.sub(r" ", r"_", img_name.encode("utf-8")) + ) # do not use r'', it is encoded + print(img_name, img_saved_as, img_timestamp) + + # wget file + if ( + original_name != img_name + ): # the image is an old version, download using /archive/ path in server + os.system( + 'wget -c "https://upload.wikimedia.org/wikipedia/commons/archive/%s/%s/%s" -O "%s/%s"' + % ( + md5hash[0], + md5hash[0:2], + img_name_quoted, + savepath, + img_saved_as_, + ) + ) + try: + if not os.path.getsize( + f"{savepath}/{img_saved_as_}" + ): # empty file?... + # probably false 20101005024534! begining like this http://commons.wikimedia.org/wiki/File:20041028210012!Pilar.jpg + # ok, restore original_name to ! 
version and recalculate md5 and other variables that use original_name as source + original_name = img_name + original_name_ = re.sub( + r'"', + r"\"", + re.sub(r" ", r"_", original_name.encode("utf-8")), + ) + md5hash = md5( + re.sub(" ", "_", original_name.encode("utf-8")) + ).hexdigest() + # redownload, now without /archive/ subpath + os.system( + 'wget -c "https://upload.wikimedia.org/wikipedia/commons/%s/%s/%s" -O "%s/%s"' + % ( + md5hash[0], + md5hash[0:2], + img_name_quoted, + savepath, + img_saved_as_, + ) + ) + except OSError: + pass + else: + # Issue #66 : try your.org first + os.system( + 'wget -c "http://ftpmirror.your.org/pub/wikimedia/images/wikipedia/commons/%s/%s/%s" -O "%s/%s"' + % ( + md5hash[0], + md5hash[0:2], + img_name_quoted, + savepath, + img_saved_as_, + ) + ) + os.system( + 'wget -c "https://upload.wikimedia.org/wikipedia/commons/%s/%s/%s" -O "%s/%s"' + % ( + md5hash[0], + md5hash[0:2], + img_name_quoted, + savepath, + img_saved_as_, + ) + ) + + # curl .xml description page with full history + os.system( + 'curl -d "&pages=File:%s&history=1&action=submit" https://commons.wikimedia.org/w/index.php?title=Special:Export -o "%s/%s.xml"' + % (original_name_, savepath, img_saved_as_) + ) + + # save csv info + g = csv.writer( + open(filenamecsv, "a"), + delimiter="|", + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + ) + g.writerow( + [ + img_name.encode("utf-8"), + img_saved_as.encode("utf-8"), + img_timestamp, + img_user, + img_user_text.encode("utf-8"), + img_size, + img_width, + img_height, + ] + ) + c += 1 + # zip downloaded files; add mT to the options if you want to save space by removing the downloaded files day by day; commonschecker needs only zip and csv + os.system(f"zip -9r {filenamezip} {savepath}/") + startdate += delta + bye() + + +if __name__ == "__main__": + main() diff --git a/wikimediacommons/commonssql.py b/tools/wikimediacommons/commonssql.py similarity index 53% rename from wikimediacommons/commonssql.py rename to 
tools/wikimediacommons/commonssql.py index 830a7095..6393c341 100644 --- a/wikimediacommons/commonssql.py +++ b/tools/wikimediacommons/commonssql.py @@ -1,42 +1,54 @@ -#!/usr/bin/env python2 -# -*- coding: utf8 -*- +#!/usr/bin/env python3 # Copyright (C) 2012-2016 WikiTeam developers # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program. If not, see . import csv -import MySQLdb import re import sys +import pymysql + + def main(): year = int(sys.argv[1]) - filename = 'commonssql-%s.csv' % (year) - f = open(filename, 'w') - f.write('img_name|img_timestamp|img_user|img_user_text|img_size|img_width|img_height\n') + filename = "commonssql-%s.csv" % (year) + f = open(filename, "w") + f.write( + "img_name|img_timestamp|img_user|img_user_text|img_size|img_width|img_height\n" + ) f.close() - #http://www.mediawiki.org/wiki/Manual:Image_table - #http://www.mediawiki.org/wiki/Manual:Oldimage_table - + # http://www.mediawiki.org/wiki/Manual:Image_table + # http://www.mediawiki.org/wiki/Manual:Oldimage_table + queries = [ - "SELECT /* commonssql.py SLOW_OK */ img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height FROM image WHERE img_timestamp>=%d0101000000 AND img_timestamp<=%d1231235959 ORDER BY img_timestamp ASC" % (year, year), - "SELECT /* commonssql.py SLOW_OK */ oi_archive_name AS img_name, oi_timestamp AS img_timestamp, oi_user AS img_user, oi_user_text AS img_user_text, oi_size AS img_size, oi_width AS img_width, 
oi_height AS img_height FROM oldimage WHERE oi_deleted=0 AND oi_timestamp>=%d0101000000 AND oi_timestamp<=%d1231235959 ORDER BY oi_timestamp ASC" % (year, year), #do not get unavailable images + "SELECT /* commonssql.py SLOW_OK */ img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height FROM image WHERE img_timestamp>=%d0101000000 AND img_timestamp<=%d1231235959 ORDER BY img_timestamp ASC" + % (year, year), + "SELECT /* commonssql.py SLOW_OK */ oi_archive_name AS img_name, oi_timestamp AS img_timestamp, oi_user AS img_user, oi_user_text AS img_user_text, oi_size AS img_size, oi_width AS img_width, oi_height AS img_height FROM oldimage WHERE oi_deleted=0 AND oi_timestamp>=%d0101000000 AND oi_timestamp<=%d1231235959 ORDER BY oi_timestamp ASC" + % (year, year), # do not get unavailable images ] - f = csv.writer(open(filename, 'a'), delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL) - conn = MySQLdb.connect(host='s4.labsdb', db='commonswiki_p', read_default_file='~/replica.my.cnf', use_unicode=True) + f = csv.writer( + open(filename, "a"), delimiter="|", quotechar='"', quoting=csv.QUOTE_MINIMAL + ) + conn = pymysql.connect( + host="s4.labsdb", + db="commonswiki_p", + read_default_file="~/replica.my.cnf", + use_unicode=True, + ) for query in queries: conn.query(query) r = conn.store_result() @@ -45,15 +57,25 @@ def main(): rows = [] while row: if len(row) == 1: - img_name = re.sub(' ', '_', row[0]['img_name']) - img_timestamp = row[0]['img_timestamp'] - img_user = row[0]['img_user'] - img_user_text = re.sub(' ', '_', row[0]['img_user_text']) - img_size = row[0]['img_size'] - img_width = row[0]['img_width'] - img_height = row[0]['img_height'] - - rows.append([img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height]) + img_name = re.sub(" ", "_", row[0]["img_name"]) + img_timestamp = row[0]["img_timestamp"] + img_user = row[0]["img_user"] + img_user_text = re.sub(" ", "_", row[0]["img_user_text"]) + img_size = 
row[0]["img_size"] + img_width = row[0]["img_width"] + img_height = row[0]["img_height"] + + rows.append( + [ + img_name, + img_timestamp, + img_user, + img_user_text, + img_size, + img_width, + img_height, + ] + ) c += 1 if c % 10000 == 0: print(c) @@ -62,5 +84,6 @@ def main(): row = r.fetch_row(maxrows=1, how=1) f.writerows(rows) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/wikimediacommons/commonssql.sh b/tools/wikimediacommons/commonssql.sh similarity index 100% rename from wikimediacommons/commonssql.sh rename to tools/wikimediacommons/commonssql.sh diff --git a/wikimediacommons/commonssql2.sh b/tools/wikimediacommons/commonssql2.sh similarity index 100% rename from wikimediacommons/commonssql2.sh rename to tools/wikimediacommons/commonssql2.sh diff --git a/tools/wikipediadownloader.py b/tools/wikipediadownloader.py new file mode 100644 index 00000000..1c9b87bb --- /dev/null +++ b/tools/wikipediadownloader.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2011-2014 WikiTeam +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import argparse +import os +import re +import sys +import time +import urllib + + +def main(): + parser = argparse.ArgumentParser(description="Downloader of Wikimedia dumps") + # parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. 
all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False) + parser.add_argument( + "-r", + "--maxretries", + help="Max retries to download a dump when md5sum doesn't fit. Default: 3", + required=False, + ) + parser.add_argument( + "-s", + "--start", + help="Start to download from this project (e.g.: eswiki, itwikisource, etc)", + required=False, + ) + args = parser.parse_args() + + maxretries = 3 + if args.maxretries and int(args.maxretries) >= 0: + maxretries = int(args.maxretries) + + dumpsdomain = "http://dumps.wikimedia.org" + f = urllib.request.urlopen("%s/backup-index.html" % (dumpsdomain)) + raw = f.read() + f.close() + + m = re.compile( + r'[^<]+: Dump complete' + ).finditer(raw) + projects = [] + for i in m: + projects.append([i.group("project"), i.group("date")]) + projects.reverse() # download oldest dumps first + # projects = [['enwiki', '20130805']] + + start = args.start + for project, date in projects: + if start: + if start != project: + print(f"Skipping {project}, {date}") + continue + else: + start = "" # reset + + print("-" * 50, "\n", "Checking", project, date, "\n", "-" * 50) + time.sleep(1) # ctrl-c + f = urllib.request.urlopen(f"{dumpsdomain}/{project}/{date}/") + htmlproj = f.read() + # print (htmlproj) + f.close() + + for dumpclass in [r"pages-meta-history\d*\.xml[^\.]*\.7z"]: + corrupted = True + maxretries2 = maxretries + while corrupted and maxretries2 > 0: + maxretries2 -= 1 + m = re.compile( + r'' + % (project, date, project, date, dumpclass) + ).finditer(htmlproj) + urldumps = [] + # enwiki is splitted in several files, thats why we need a loop + # here + for i in m: + urldumps.append("{}/{}".format(dumpsdomain, i.group("urldump"))) + + # print (urldumps) + for urldump in urldumps: + dumpfilename = urldump.split("/")[-1] + path = f"{dumpfilename[0]}/{project}" + if not os.path.exists(path): + os.makedirs(path) + os.system(f"wget -c {urldump} -O {path}/{dumpfilename}") + + # md5check + 
os.system(f"md5sum {path}/{dumpfilename} > md5") + f = open("md5") + raw = f.read() + f.close() + md51 = re.findall( + rf"(?P[a-f0-9]{{32}})\s+{path}/{dumpfilename}", raw + )[0] + print(md51) + + f = urllib.request.urlopen( + "%s/%s/%s/%s-%s-md5sums.txt" + % (dumpsdomain, project, date, project, date) + ) + raw = f.read() + f.close() + f = open(f"{path}/{project}-{date}-md5sums.txt", "w") + f.write(raw) + f.close() + md52 = re.findall( + r"(?P[a-f0-9]{32})\s+%s" % (dumpfilename), raw + )[0] + print(md52) + + if md51 == md52: + print(r"md5sum is correct for this file, horay! \o/") + print("\n" * 3) + corrupted = False + else: + os.remove(f"{path}/{dumpfilename}") + + +if __name__ == "__main__": + main() diff --git a/tools/wikispaces.py b/tools/wikispaces.py new file mode 100644 index 00000000..404d877b --- /dev/null +++ b/tools/wikispaces.py @@ -0,0 +1,614 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2018 WikiTeam developers +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki +# Documentation for developers: http://wikiteam.readthedocs.com +""" +# You need a file with access and secret keys, in two different lines +iakeysfilename = '%s/.iakeys' % (os.path.expanduser('~')) +if os.path.exists(iakeysfilename): + accesskey = open(iakeysfilename, 'r').readlines()[0].strip() + secretkey = open(iakeysfilename, 'r').readlines()[1].strip() +else: + print('Error, no %s file with S3 keys for Internet Archive account' % (iakeysfilename)) + sys.exit() +""" +import csv +import datetime +import os +import random +import re +import subprocess +import sys +import time +import urllib.request + +# from internetarchive import get_item + +# Requirements: +# zip command (apt-get install zip) +# ia command (pip install internetarchive, and configured properly) + + +def saveURL(wikidomain="", url="", filename="", path="", overwrite=False, iteration=1): + filename2 = f"{wikidomain}/{filename}" + if path: + filename2 = f"{wikidomain}/{path}/{filename}" + if os.path.exists(filename2): + if not overwrite: + print( + "Warning: file exists on disk. Skipping download. Force download with parameter --overwrite" + ) + return + opener = urllib.request.build_opener() + opener.addheaders = [("User-agent", "Mozilla/5.0")] + urllib.request.install_opener(opener) + try: + urllib.request.urlretrieve(url, filename2) + except: + sleep = 10 # seconds + maxsleep = 30 + while sleep <= maxsleep: + try: + print("Error while retrieving: %s" % (url)) + print("Retry in %s seconds..." % (sleep)) + time.sleep(sleep) + urllib.request.urlretrieve(url, filename2) + return + except: + sleep = sleep * 2 + print("Download failed") + + # sometimes wikispaces returns invalid data, redownload in that cases + # only 'pages'. 
'files' binaries are a pain to open and check + if (os.path.exists(filename2) and "pages" in path) or ( + os.path.exists(filename2) + and path == "" + and filename2.split(".")[-1] in ["xml", "html", "csv"] + ): + sleep2 = 60 * iteration + raw = "" + try: + with open(filename2, encoding="utf-8") as f: + raw = f.read() + except: + with open(filename2, encoding="latin-1") as f: + raw = f.read() + if re.findall(r"(?im)TES and THE Status", raw): + print( + "Warning: invalid content. Waiting %d seconds and re-downloading" + % (sleep2) + ) + time.sleep(sleep2) + saveURL( + wikidomain=wikidomain, + url=url, + filename=filename, + path=path, + overwrite=overwrite, + iteration=iteration + 1, + ) + + +def undoHTMLEntities(text=""): + """Undo some HTML codes""" + + # i guess only < > & " ' need conversion + # http://www.w3schools.com/html/html_entities.asp + text = re.sub("<", "<", text) + text = re.sub(">", ">", text) + text = re.sub("&", "&", text) + text = re.sub(""", '"', text) + text = re.sub("'", "'", text) + + return text + + +def convertHTML2Wikitext(wikidomain="", filename="", path=""): + wikitext = "" + wikitextfile = f"{wikidomain}/{path}/{filename}" + if not os.path.exists(wikitextfile): + print("Error retrieving wikitext, page is a redirect probably") + return + with open(wikitextfile) as f: + wikitext = f.read() + with open(wikitextfile, "w") as f: + m = re.findall( + r'(?im)
      \s*
      ', wikitext
      +        )
      +        if m:
      +            try:
      +                wikitext = wikitext.split(m[0])[1].split("
      ")[0].strip() + wikitext = undoHTMLEntities(text=wikitext) + except: + pass + f.write(wikitext) + + +def downloadPage(wikidomain="", wikiurl="", pagename="", overwrite=False): + pagenameplus = re.sub(" ", "+", pagename) + pagename_ = urllib.parse.quote(pagename) + + # page current revision (html & wikitext) + pageurl = f"{wikiurl}/{pagename_}" + filename = "%s.html" % (pagenameplus) + print("Downloading page: %s" % (filename)) + saveURL( + wikidomain=wikidomain, + url=pageurl, + filename=filename, + path="pages", + overwrite=overwrite, + ) + pageurl2 = f"{wikiurl}/page/code/{pagename_}" + filename2 = "%s.wikitext" % (pagenameplus) + print("Downloading page: %s" % (filename2)) + saveURL( + wikidomain=wikidomain, + url=pageurl2, + filename=filename2, + path="pages", + overwrite=overwrite, + ) + convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path="pages") + + # csv with page history + csvurl = "{}/page/history/{}?utable=WikiTablePageHistoryList&ut_csv=1".format( + wikiurl, + pagename_, + ) + csvfilename = "%s.history.csv" % (pagenameplus) + print("Downloading page: %s" % (csvfilename)) + saveURL( + wikidomain=wikidomain, + url=csvurl, + filename=csvfilename, + path="pages", + overwrite=overwrite, + ) + + +def downloadFile(wikidomain="", wikiurl="", filename="", overwrite=False): + filenameplus = re.sub(" ", "+", filename) + filename_ = urllib.parse.quote(filename) + + # file full resolution + fileurl = f"{wikiurl}/file/view/{filename_}" + filename = filenameplus + print("Downloading file: %s" % (filename)) + saveURL( + wikidomain=wikidomain, + url=fileurl, + filename=filename, + path="files", + overwrite=overwrite, + ) + + # csv with file history + csvurl = "{}/file/detail/{}?utable=WikiTablePageList&ut_csv=1".format( + wikiurl, + filename_, + ) + csvfilename = "%s.history.csv" % (filenameplus) + print("Downloading file: %s" % (csvfilename)) + saveURL( + wikidomain=wikidomain, + url=csvurl, + filename=csvfilename, + path="files", + 
overwrite=overwrite, + ) + + +def downloadPagesAndFiles(wikidomain="", wikiurl="", overwrite=False): + print("Downloading Pages and Files from %s" % (wikiurl)) + # csv all pages and files + csvurl = "%s/space/content?utable=WikiTablePageList&ut_csv=1" % (wikiurl) + saveURL(wikidomain=wikidomain, url=csvurl, filename="pages-and-files.csv", path="") + # download every page and file + totallines = 0 + with open("%s/pages-and-files.csv" % (wikidomain)) as f: + totallines = len(f.read().splitlines()) - 1 + with open("%s/pages-and-files.csv" % (wikidomain)) as csvfile: + filesc = 0 + pagesc = 0 + print("This wiki has %d pages and files" % (totallines)) + rows = csv.reader(csvfile, delimiter=",", quotechar='"') + for row in rows: + if row[0] == "file": + filesc += 1 + filename = row[1] + downloadFile( + wikidomain=wikidomain, + wikiurl=wikiurl, + filename=filename, + overwrite=overwrite, + ) + elif row[0] == "page": + pagesc += 1 + pagename = row[1] + downloadPage( + wikidomain=wikidomain, + wikiurl=wikiurl, + pagename=pagename, + overwrite=overwrite, + ) + if (filesc + pagesc) % 10 == 0: + print(" Progress: %d of %d" % ((filesc + pagesc), totallines)) + print(" Progress: %d of %d" % ((filesc + pagesc), totallines)) + print("Downloaded %d pages" % (pagesc)) + print("Downloaded %d files" % (filesc)) + + +def downloadSitemap(wikidomain="", wikiurl="", overwrite=False): + print("Downloading sitemap.xml") + saveURL( + wikidomain=wikidomain, + url=wikiurl, + filename="sitemap.xml", + path="", + overwrite=overwrite, + ) + + +def downloadMainPage(wikidomain="", wikiurl="", overwrite=False): + print("Downloading index.html") + saveURL( + wikidomain=wikidomain, + url=wikiurl, + filename="index.html", + path="", + overwrite=overwrite, + ) + + +def downloadLogo(wikidomain="", wikiurl="", overwrite=False): + index = "%s/index.html" % (wikidomain) + if os.path.exists(index): + raw = "" + try: + with open(index, encoding="utf-8") as f: + raw = f.read() + except: + with open(index, 
encoding="latin-1") as f: + raw = f.read() + m = re.findall(r'class="WikiLogo WikiElement"> 2: + if "--upload" in sys.argv: + upload = True + if "--admin" in sys.argv: + isadmin = True + if "--overwrite" in sys.argv: + overwrite = True + if "--overwrite-ia" in sys.argv: + overwriteia = True + if "--help" in sys.argv: + printhelp() + + wikilist = [] + if "://" in param: + wikilist.append(param.rstrip("/")) + elif param.lower() == "duckduckgo": + wikilist = duckduckgo() + # for wiki in wikilist: + # print(wiki) + else: + with open(param) as f: + wikilist = f.read().strip().splitlines() + wikilist2 = [] + for wiki in wikilist: + wikilist2.append(wiki.rstrip("/")) + wikilist = wikilist2 + + for wikiurl in wikilist: + wikidomain = wikiurl.split("://")[1].split("/")[0] + print("\n") + print("#" * 40, "\n Downloading:", wikiurl) + print("#" * 40, "\n") + + if upload and not overwriteia: + itemid = "wiki-%s" % (wikidomain) + try: + iahtml = "" + try: + iahtml = ( + urllib.request.urlopen( + "https://archive.org/details/%s" % (itemid) + ) + .read() + .decode("utf-8") + ) + except: + time.sleep(10) + iahtml = ( + urllib.request.urlopen( + "https://archive.org/details/%s" % (itemid) + ) + .read() + .decode("utf-8") + ) + if iahtml and not re.findall(r"(?im)Item cannot be found", iahtml): + if not overwriteia: + print( + "Warning: item exists on Internet Archive. Skipping wiki. 
Force with parameter --overwrite-ia" + ) + print( + "You can find it in https://archive.org/details/%s" + % (itemid) + ) + time.sleep(1) + continue + except: + pass + + dirfiles = "%s/files" % (wikidomain) + if not os.path.exists(dirfiles): + print("Creating directory %s" % (dirfiles)) + os.makedirs(dirfiles) + dirpages = "%s/pages" % (wikidomain) + if not os.path.exists(dirpages): + print("Creating directory %s" % (dirpages)) + os.makedirs(dirpages) + sitemapurl = "https://%s/sitemap.xml" % (wikidomain) + + downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite) + if not os.path.exists("%s/sitemap.xml" % (wikidomain)): + print("Error, wiki was probably deleted. Skiping wiki...") + continue + else: + sitemapraw = "" + try: + with open("%s/sitemap.xml" % (wikidomain), encoding="utf-8") as g: + sitemapraw = g.read() + except: + with open("%s/sitemap.xml" % (wikidomain), encoding="latin-1") as g: + sitemapraw = g.read() + if re.search(r"(?im)

      This wiki has been deactivated

      ", sitemapraw): + print("Error, wiki was deactivated. Skiping wiki...") + continue + + downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite) + if not os.path.exists("%s/index.html" % (wikidomain)): + print("Error, wiki was probably deleted or expired. Skiping wiki...") + continue + else: + indexraw = "" + try: + with open("%s/index.html" % (wikidomain), encoding="utf-8") as g: + indexraw = g.read() + except: + with open("%s/index.html" % (wikidomain), encoding="latin-1") as g: + indexraw = g.read() + if re.search(r"(?im)

      Subscription Expired

      ", indexraw): + print("Error, wiki subscription expired. Skiping wiki...") + continue + + downloadPagesAndFiles( + wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite + ) + logofilename = downloadLogo( + wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite + ) + + if upload: + itemid = "wiki-%s" % (wikidomain) + print("\nCompressing dump...") + wikidir = wikidomain + os.chdir(wikidir) + print("Changed directory to", os.getcwd()) + wikizip = "%s.zip" % (wikidomain) + subprocess.call( + "zip" + + " -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s" + % (wikizip, logofilename), + shell=True, + ) + os.chdir("..") + print("Changed directory to", os.getcwd()) + + print("\nUploading to Internet Archive...") + indexfilename = "%s/index.html" % (wikidir) + if not os.path.exists(indexfilename): + print("\nError dump incomplete, skipping upload\n") + continue + indexhtml = "" + try: + with open(indexfilename, encoding="utf-8") as f: + indexhtml = f.read() + except: + with open(indexfilename, encoding="latin-1") as f: + indexhtml = f.read() + + wikititle = "" + try: + wikititle = ( + indexhtml.split("wiki: {")[1] + .split("}")[0] + .split("text: '")[1] + .split("',")[0] + .strip() + ) + except: + wikititle = wikidomain + if not wikititle: + wikititle = wikidomain + wikititle = wikititle.replace("\\'", " ") + wikititle = wikititle.replace('\\"', " ") + itemtitle = "Wiki - %s" % wikititle + itemdesc = ( + '
      %s dumped with WikiTeam tools.' + % (wikiurl, wikititle) + ) + itemtags = [ + "wiki", + "wikiteam", + "wikispaces", + wikititle, + wikidomain.split(".wikispaces.com")[0], + wikidomain, + ] + itemoriginalurl = wikiurl + itemlicenseurl = "" + m = "" + try: + m = re.findall( + r'', + indexhtml.split('
      ")[0], + ) + except: + m = "" + if m: + itemlicenseurl = m[0] + if not itemlicenseurl: + itemtags.append("unknowncopyright") + itemtags_ = " ".join( + ["--metadata='subject:%s'" % (tag) for tag in itemtags] + ) + itemcollection = isadmin and "wikiteam" or "opensource" + itemlang = "Unknown" + itemdate = datetime.datetime.now().strftime("%Y-%m-%d") + itemlogo = logofilename and f"{wikidir}/{logofilename}" or "" + callplain = "ia upload {} {} {} --metadata='mediatype:web' --metadata='collection:{}' --metadata='title:{}' --metadata='description:{}' --metadata='language:{}' --metadata='last-updated-date:{}' --metadata='originalurl:{}' {} {}".format( + itemid, + wikizip, + itemlogo and itemlogo or "", + itemcollection, + itemtitle, + itemdesc, + itemlang, + itemdate, + itemoriginalurl, + itemlicenseurl + and "--metadata='licenseurl:%s'" % (itemlicenseurl) + or "", + itemtags_, + ) + print(callplain) + subprocess.call(callplain, shell=True) + + """ + md = { + 'mediatype': 'web', + 'collection': itemcollection, + 'title': itemtitle, + 'description': itemdesc, + 'language': itemlang, + 'last-updated-date': itemdate, + 'subject': '; '.join(itemtags), + 'licenseurl': itemlicenseurl, + 'originalurl': itemoriginalurl, + } + item = get_item(itemid) + item.upload(wikizip, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False) + item.modify_metadata(md) + if itemlogo: + item.upload(itemlogo, access_key=accesskey, secret_key=secretkey, verbose=True) + """ + + print("You can find it in https://archive.org/details/%s" % (itemid)) + os.remove(wikizip) + + +if __name__ == "__main__": + main() diff --git a/tools/xml2titles.py b/tools/xml2titles.py new file mode 100644 index 00000000..3478cb5b --- /dev/null +++ b/tools/xml2titles.py @@ -0,0 +1,381 @@ +''' +Extracts all titles from a XML dump file and writes them to `*-xml2titles.txt`. 
+ +requirements: + file_read_backwards +''' + +import dataclasses +import os +import argparse +import tqdm +import sys +# import re +import xml.sax +from xml.sax.saxutils import unescape + +from file_read_backwards import FileReadBackwards + + +''' + + abcde + 0 + 107 + + 238 + 2021-08-15T23:07:10Z + + user + 3 + + + wikitext + text/x-wiki + text + + +''' +class XMLBaseHandler(xml.sax.handler.ContentHandler): + '''only work on level <= 3 of the XML tree''' + + fileSize = 0 + class page__: + # TODO + pass + + def __init__(self, fileSize=0): + self.fileSize = fileSize + self.tqdm_progress = tqdm.tqdm( + total=self.fileSize, unit="B", unit_scale=True, unit_divisor=1024, desc="Parsing XML" + ) + self.globalParsedBytes = 0 + self.debugCount = 0 + self.silent = False + + self.depth = 0 + + # page + self.inPage = False + self.page = {} + self.pageTagsCount = 0 + self.pageRevisionsCount = 0 + # title + self.inTitle = False + self.title = None + self.titleTagsCount = 0 + # ns + self.inNs = False + self.ns = None + self.nsTagsCount = 0 + # id + self.inId = False + self.id = None + self.idTagsCount = 0 + # revision + self.inRevision = False + self.revision = None + self.revisionTagsCount = 0 + + def __del__(self): + self.close_tqdm() + + def close_tqdm(self): + self.tqdm_progress.close() + + def __debugCount(self): + self.debugCount += 1 + print(self.debugCount) + + def resetPageTag(self): + self.title = self.ns = self.id = self.revision = None + self.pageRevisionsCount = 0 + # print("resetPageTag") + + def startElement(self, name, attrs): + self.depth+=1 + if self.depth > 3: + self.startElementOverDepth3(name, attrs) + return + + if name == "page": + self.inPage = True + self.pageTagsCount += 1 + if name == "title": + self.inTitle = True + self.titleTagsCount += 1 + if name == "ns": + self.inNs = True + self.nsTagsCount += 1 + if name == "id": + self.inId = True + self.idTagsCount += 1 + if name == "revision": + self.inRevision = True + self.pageRevisionsCount += 1 + 
self.revisionTagsCount += 1 + + def endElement(self, name): + if self.depth > 3: + self.endElementOverDepth3(name) + self.depth-=1 + return + self.depth-=1 + if name == "page": + self.inPage = False + + if self.title is not None: + self.page["title"] = self.title + if self.ns is not None: + self.page["ns"] = self.ns + if self.id is not None: + self.page["id"] = self.id + if self.pageRevisionsCount is not None: + self.page["revisionsCount"] = self.pageRevisionsCount + + self.resetPageTag() + if name == "title": + self.inTitle = False + if name == "ns": + self.inNs = False + if name == "id": + self.inId = False + if name == "revision": + self.inRevision = False + + def characters(self, content, not_parse_tags=["?"]): + bufferSize = len(content.encode("utf-8")) + self.globalParsedBytes += bufferSize + # print(bufferSize) + self.tqdm_progress.update(bufferSize) # NOTE: sum(bufferSize...) != fileSize + + + if self.inPage: + pass + if self.inTitle: + # self.__debugCount() + self.cjoin("title", content) if 'title' not in not_parse_tags else None + if self.inNs: + self.cjoin("ns", content) if 'ns' not in not_parse_tags else None + if self.inId: + self.cjoin("id", content) if 'id' not in not_parse_tags else None + if self.inRevision: + self.cjoin("revision", content) if 'revision' not in not_parse_tags else None + + def endDocument(self): + if self.depth != 0: + raise RuntimeError("depth != 0 at the end of the XML document") + + def startElementOverDepth3(self, name, attrs): + pass + + def endElementOverDepth3(self, name): + pass + + def cjoin(self, obj, content): + ''' self.obj = self.obj + content if self.obj is not None else content + + obj: str + ''' + if hasattr(self, obj): + if getattr(self, obj) is None: + setattr(self, obj, content) + else: + # assert ''.join((getattr(self, obj), content)) == content if getattr(self, obj) is None else getattr(self, obj) + content + setattr(self, obj, ''.join((getattr(self, obj), content))) + pass + else: + raise 
AttributeError("XMLBaseHandler has no attribute %s" % obj) + setattr(self, obj, content) + + +class TitlesHandler(XMLBaseHandler): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.set_titles = set() + self.list_titles = [] + def endElement(self, name): + # print(self.revision) if name == "page" else None + super().endElement(name) + if name == "page": + if self.page['title'] is not None: + if self.page['title'] in self.set_titles: + print("Duplicate title found: %s" % self.page['title']) if not self.silent else None + else: + self.set_titles.add(self.page['title']) + self.list_titles.append(self.page['title']) # unique + if not self.silent: + print(self.page) + def characters(self, content): + return super().characters(content, not_parse_tags=["revision"]) + +class PagesHandler(XMLBaseHandler): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.pageTextsAttrs = [] + self.pageTextsRealLength = 0 + + # text + self.inText = False + self.pageTexts: str = None + self.textTagsCount = 0 + + # TODO + def startElementOverDepth3(self, name, attrs): + super().startElementOverDepth3(name, attrs) + if name == 'text' and attrs: + self.pageTextsAttrs.append(attrs.items()) + self.page['textsAttrs'] = self.pageTextsAttrs + if name == 'text': + self.inText = True + self.textTagsCount += 1 + + def endElementOverDepth3(self, name): + super().endElementOverDepth3(name) + if name == 'text': + self.inText = False + + def resetPageTag(self): + super().resetPageTag() + self.pageTextsAttrs = [] + self.pageTextsRealLength = -1 + self.pageTexts: str = None + + def endElement(self, name): + self.pageTextsRealLength = len(self.pageTexts.encode('utf-8')) if self.pageTexts is not None else 0 + self.page['textsRealLength'] = self.pageTextsRealLength + super().endElement(name) + # if name == "page": + # print(self.page) + + def characters(self, content, *args, **kwargs): + super().characters(content, *args, **kwargs) + if self.inText: 
+ self.cjoin("pageTexts", content) + + +class MediaNsHandler(XMLBaseHandler): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # self.mediaNsPages = [] + self.mediaNsPagesName_set = set() + self.mediaNsPagesID_set = set() + def endElement(self, name): + super().endElement(name) + if name == "page": + if self.page['ns'] == '6': + if self.page['title'] in self.mediaNsPagesName_set: + if not self.silent: + print("Duplicate title found: %s" % self.page['title']) + else: + self.mediaNsPagesName_set.add(self.page['title']) + # self.mediaNsPages.append(self.page) + # print(self.page) + if self.page['id'] in self.mediaNsPagesID_set: + if not self.silent: + print("Duplicate id found: %s" % self.page['id']) + else: + self.mediaNsPagesID_set.add(self.page['id']) + # self.mediaNsPages.append(self.page) + print(self.page) + def characters(self, content): + return super().characters(content, not_parse_tags=["revision"]) + +def get_titles_from_xml(xmlfile, return_type="list", silent=False): + '''Return a list/set of titles from a XML dump file.\n + `xmlfile`: a system identifier or an InputSource.\n + `return_type`:`"list"` or `"set"` (default: `"list"`). + The `list` keeps the order of XML file, and is unique. 
+ ''' + # xmlfile_size = os.path.getsize(xmlfile) + parser = xml.sax.make_parser() + handler = TitlesHandler(os.path.getsize(xmlfile)) + # handler = PagesHandler(os.path.getsize(xmlfile)) # TODO + # handler = MediaNsHandler(os.path.getsize(xmlfile)) # TODO + handler.silent = silent + parser.setContentHandler(handler) + parser.parse(xmlfile) + handler.close_tqdm() + print('',flush=True) + print('pageTagsCount:', handler.pageTagsCount, + 'titleTagsCount:', handler.titleTagsCount, + 'nsTagsCount:', handler.nsTagsCount, + 'idTagsCount:', handler.idTagsCount, + 'revisionTagsCount:', handler.revisionTagsCount) + # print('MediaNsPages (Name):', len(handler.mediaNsPagesName_set)) + # print('MediaNsPages (ID):', len(handler.mediaNsPagesID_set)) + + if len(handler.set_titles) != len(handler.list_titles): + raise RuntimeError("len(set_titles) and (list_titles) are not equal!") + + titles = handler.set_titles if return_type == "set" else handler.list_titles + + return titles + + +@dataclasses.dataclass +class Config: + xmlfile: str + dry: bool + verbose: bool + +def getArguments(): + parser = argparse.ArgumentParser() + + parser.description = "Extracts all titles from a XML dump file and writes them to `*-xml2titles.txt`." + parser.add_argument("xmlfile", help="XML file of wiki dump") + parser.add_argument("--dry", help="Do not write to file",action="store_true") + parser.add_argument("--verbose", help="Verbose",action="store_true") + + args = parser.parse_args() + config = Config + config.xmlfile = args.xmlfile + config.dry = args.dry + config.verbose = args.verbose + + return config + + +if __name__ == "__main__": + args = getArguments() + + print('Parsing...') + + xmlfile = args.xmlfile + if not os.path.exists(xmlfile): + print("XML file does not exist!") + sys.exit(1) + + xml_basename = os.path.basename(xmlfile) + xml_dir = os.path.dirname(xmlfile) + + assert xml_basename.endswith(".xml") + "XML file name does not end with .xml!" 
+ assert xml_basename.endswith("-current.xml") or xml_basename.endswith("-history.xml") + "XML file name does not end with -current.xml or -history.xml!" + + with FileReadBackwards(xmlfile, encoding='utf-8') as frb: + seeked = 0 + for line in frb: + seeked += 1 + if "" in line: + # xml dump is complete + break + if seeked > 4: + raise Exception('xml dump is incomplete!') + + _silent = not args.verbose + + titles = get_titles_from_xml(xmlfile=xmlfile, return_type="list", silent=_silent) + + if args.dry: + print("Dry run. No file will be written.") + sys.exit(0) + + titles_filename = xml_basename.replace("-current.xml", "-xml2titles.txt").replace("-history.xml", "-xml2titles.txt") + titles_filepath = os.path.join(xml_dir, titles_filename) + with open(titles_filepath, "w") as f: + f.write("\n".join(titles)) + f.write("\n--END--\n") + + print("Done! %d titles extracted to %s" % (len(titles), titles_filepath)) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 1316f6ea..00000000 --- a/tox.ini +++ /dev/null @@ -1,21 +0,0 @@ -[tox] -# 1.6 for skipsdist -minversion = 1.6 -# Dont use setup.py (there is none) -skipsdist = True -# List of environement to run by default -envlist = py27 - -[testenv] -# Test command with output reported -commands = nosetests --nocapture --nologcapture -deps = - nose - -rrequirements.txt - -[testenv:flake8] -commands = flake8 {posargs} -deps = flake8 - -[flake8] -exclude = .tox,.venv,build,dist,doc diff --git a/uploader.py b/uploader.py deleted file mode 100644 index 73626f79..00000000 --- a/uploader.py +++ /dev/null @@ -1,303 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -# Copyright (C) 2011-2016 WikiTeam -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -import getopt -import argparse -import os -import re -import subprocess -import sys -import time -import requests -import urlparse -from io import BytesIO -from xml.sax.saxutils import quoteattr -from internetarchive import get_item - -import dumpgenerator - -# You need a file named keys.txt with access and secret keys, in two different lines -accesskey = open('keys.txt', 'r').readlines()[0].strip() -secretkey = open('keys.txt', 'r').readlines()[1].strip() - -# Nothing to change below -convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'} - -def log(wiki, dump, msg, config={}): - f = open('uploader-%s.log' % (config.listfile), 'a') - f.write('\n%s;%s;%s' % (wiki, dump, msg)) - f.close() - -def upload(wikis, config={}, uploadeddumps=[]): - headers = {'User-Agent': dumpgenerator.getUserAgent()} - dumpdir = config.wikidump_dir - - filelist = os.listdir(dumpdir) - for wiki in wikis: - print "#"*73 - print "# Uploading", wiki - print "#"*73 - wiki = wiki.lower() - configtemp = config - try: - prefix = dumpgenerator.domain2prefix(config={'api': wiki}) - except KeyError: - print "ERROR: could not produce the prefix for %s" % wiki - config = configtemp - - wikiname = prefix.split('-')[0] - dumps = [] - for f in filelist: - if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')): - print "%s found" % f - dumps.append(f) - # Re-introduce the break here if you only need to upload one file - # and the I/O is too slow - # break - - 
c = 0 - for dump in dumps: - wikidate = dump.split('-')[1] - item = get_item('wiki-' + wikiname) - if dump in uploadeddumps: - if config.prune_directories: - rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate) - # With -f the deletion might have happened before and we won't know - if not os.system(rmline): - print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate) - if config.prune_wikidump and dump.endswith('wikidump.7z'): - # Simplistic quick&dirty check for the presence of this file in the item - print "Checking content in previously uploaded files" - stdout, stderr = subprocess.Popen(["md5sum", dumpdir + '/' + dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() - dumphash = re.sub(' +.+\n?', '', stdout) - - if dumphash in map(lambda x: x['md5'], item.files): - log(wiki, dump, 'verified', config) - rmline='rm -rf %s' % dumpdir + '/' + dump - if not os.system(rmline): - print 'DELETED ' + dumpdir + '/' + dump - print '%s was uploaded before, skipping...' % (dump) - continue - else: - print 'ERROR: The online item misses ' + dump - log(wiki, dump, 'missing', config) - # We'll exit this if and go upload the dump - else: - print '%s was uploaded before, skipping...' % (dump) - continue - else: - print '%s was not uploaded before' % dump - - time.sleep(0.1) - wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8] - print wiki, wikiname, wikidate, dump - - # Does the item exist already? 
- ismissingitem = not item.exists - - # Logo path - logourl = '' - - if ismissingitem or config.update: - #get metadata from api.php - #first sitename and base url - params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'} - try: - r = requests.get(url=wiki, params=params, headers=headers) - if r.status_code < 400: - xml = r.text - except requests.exceptions.ConnectionError as e: - pass - - sitename = '' - baseurl = '' - lang = '' - try: - sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0] - except: - pass - try: - baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0] - except: - pass - try: - lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0] - except: - pass - - if not sitename: - sitename = wikiname - if not baseurl: - baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki) - # Convert protocol-relative URLs - baseurl = re.sub('^//', 'https://', baseurl) - if lang: - lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower() - - #now copyright info from API - params = {'action': 'query', 'meta': 'siteinfo', 'siprop': 'general|rightsinfo', 'format': 'xml'} - xml = '' - try: - r = requests.get(url=wiki, params=params, headers=headers) - if r.status_code < 400: - xml = r.text - except requests.exceptions.ConnectionError as e: - pass - - rightsinfourl = '' - rightsinfotext = '' - try: - rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0] - rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0] - except: - pass - - raw = '' - try: - r = requests.get(url=baseurl, headers=headers) - if r.status_code < 400: - raw = r.text - except requests.exceptions.ConnectionError as e: - pass - - #or copyright info from #footer in mainpage - if baseurl and not rightsinfourl and not rightsinfotext: - print("INFO: Getting license from the HTML") - rightsinfotext = '' - rightsinfourl = '' - try: - rightsinfourl = re.findall(ur"", raw)[0] - except: - pass - try: - rightsinfotext = re.findall(ur"
    • ([^\n\r]*?)
    • ", raw)[0] - except: - pass - if rightsinfotext and not rightsinfourl: - rightsinfourl = baseurl + '#footer' - try: - logourl = re.findall(ur'p-logo["\'][^>]*>\s*
      ]*background-image:\s*(?:url\()?([^;)"]+)', raw) - if logourl: - logourl = logourl[0] - else: - logourl = re.findall(ur'"wordmark-image">[^<]*]*>[^<]*%s dumped with WikiTeam tools." % (baseurl, sitename)# "ECGpedia,: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with WikiTeam tools." - wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki - - if not rightsinfourl and not rightsinfotext: - wikikeys.append('unknowncopyright') - if "www.fandom.com" in rightsinfourl and "/licensing" in rightsinfourl: - # Link the default license directly instead - rightsinfourl = "https://creativecommons.org/licenses/by-sa/3.0/" - wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/ - wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found. - - wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php - else: - print 'Item already exists.' - lang = 'foo' - wikititle = 'foo' - wikidesc = 'foo' - wikikeys = 'foo' - wikilicenseurl = 'foo' - wikirights = 'foo' - wikiurl = 'foo' - - if c == 0: - # Item metadata - md = { - 'mediatype': 'web', - 'collection': config.collection, - 'title': wikititle, - 'description': wikidesc, - 'language': lang, - 'last-updated-date': wikidate_text, - 'subject': '; '.join(wikikeys), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ... 
- 'licenseurl': wikilicenseurl and urlparse.urljoin(wiki, wikilicenseurl), - 'rights': wikirights, - 'originalurl': wikiurl, - } - - #Upload files and update metadata - try: - item.upload(dumpdir + '/' + dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False) - item.modify_metadata(md) # update - print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname) - uploadeddumps.append(dump) - except Exception as e: - print wiki, dump, 'Error when uploading?' - print(e) - try: - log(wiki, dump, 'ok', config) - if logourl: - logo = BytesIO(requests.get(logourl, timeout=10).content) - if '.png' in logourl: - logoextension = 'png' - elif logourl.split('.'): - logoextension = logourl.split('.')[-1] - else: - logoextension = 'unknown' - logoname = 'wiki-' + wikiname + '_logo.' + logoextension - item.upload({logoname: logo}, access_key=accesskey, secret_key=secretkey, verbose=True) - except requests.exceptions.ConnectionError: - print(e) - - c += 1 - -def main(params=[]): - parser = argparse.ArgumentParser("""uploader.py - -This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org. -The list must be a text file with the wiki's api.php URLs, one per line. -Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format -as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ). -You need a file named keys.txt with access and secret keys, in two different lines -You also need dumpgenerator.py in the same directory as this script. 
- -Use --help to print this help.""") - - parser.add_argument('-pd', '--prune_directories', action='store_true') - parser.add_argument('-pw', '--prune_wikidump', action='store_true') - parser.add_argument('-a', '--admin', action='store_true') - parser.add_argument('-c', '--collection', default='opensource') - parser.add_argument('-wd', '--wikidump_dir', default='.') - parser.add_argument('-u', '--update', action='store_true') - parser.add_argument('listfile') - config = parser.parse_args() - if config.admin: - config.collection = 'wikiteam' - uploadeddumps = [] - listfile = config.listfile - try: - uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1] - except: - pass - print '%d dumps uploaded previously' % (len(uploadeddumps)) - wikis = open(listfile, 'r').read().strip().splitlines() - - upload(wikis, config, uploadeddumps) - -if __name__ == "__main__": - main() diff --git a/wikiadownloader.py b/wikiadownloader.py deleted file mode 100644 index a8b451f0..00000000 --- a/wikiadownloader.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -# Copyright (C) 2011 WikiTeam -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
- -# using a list of wikia subdomains, it downloads all dumps available in Special:Statistics pages -# you can use the list available at the "listofwikis" directory, the file is called wikia.com and it contains +200k wikis - -import datetime -import os -import re -import sys -import urllib - -""" -instructions: - -it requires a list of wikia wikis -there is one in the repository (listofwikis directory) - -run it: python wikiadownloader.py - -it you want to resume: python wikiadownloader.py wikitostartfrom - -where wikitostartfrom is the last downloaded wiki in the previous session - -""" - -f = open('wikia.com', 'r') -wikia = f.read().strip().split('\n') -f.close() - -print >>sys.stderr, len(wikia), 'wikis in Wikia' - -start = '!' -if len(sys.argv) > 1: - start = sys.argv[1] - -for wiki in wikia: - wiki = wiki.lower() - prefix = wiki.split('http://')[1] - if prefix < start: - continue - print >>sys.stderr, "Starting:", wiki - - f = urllib.urlopen('%s/wiki/Special:Statistics' % (wiki)) - html = f.read() - f.close() - - m = re.compile(r'(?i)(?P\d{4})-(?P\d{2})-(?P\d{2}) (?P') - itemfiles = [] - for raw2_ in raw2: - try: - x = re.findall(r'(?im)', raw2_)[0] - y = re.findall(r'(?im)(\d+)', raw2_)[0] - itemfiles.append([int(x), int(y)]) - except: - pass - - itemfiles.sort(reverse=True) - print(itemfiles) - itemdate = str(itemfiles[0][0])[0:4] + '/' + str(itemfiles[0][0])[4:6] + '/' + str(itemfiles[0][0])[6:8] - itemsize = itemfiles[0][1] - - iaparams = """|Internet Archive identifier=%s -|Internet Archive URL=%s -|Internet Archive added date=%s 00:00:00 -|Internet Archive file size=%s""" % (itemidentifier, itemurl, itemdate, itemsize) - newtext = page.text - newtext = re.sub(r'(?im)\n\}\}', '\n%s\n}}' % (iaparams), newtext) - - if page.text != newtext: - pywikibot.showDiff(page.text, newtext) - page.text = newtext - page.save('BOT - Adding dump details: %s, %s, %s bytes' % (itemidentifier, itemdate, itemsize), botflag=True) - -if __name__ == "__main__": - main() - diff 
--git a/wikimediacommons/commons-update-status.py b/wikimediacommons/commons-update-status.py deleted file mode 100644 index 00e64407..00000000 --- a/wikimediacommons/commons-update-status.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf8 -*- - -# Copyright (C) 2012-2016 WikiTeam developers -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -import json -import urllib - -def main(): - queryurl = 'https://archive.org/advancedsearch.php?q=collection%3Awikimediacommons&fl[]=identifier&sort[]=&sort[]=&sort[]=&rows=1000&page=1&output=json&callback=callback' - raw = urllib.urlopen(queryurl).read() - raw = raw.split('callback(')[1].strip(')') - result = json.loads(raw)['response']['docs'] - - identifiers = {} - for item in result: - identifier = item['identifier'] - if 'wikimediacommons-20' in identifier: - date = identifier.split('wikimediacommons-')[1] - t = date.split('-') - if len(t) == 1: - if len(t[0]) == 4: # YYYY - identifiers[t[0]] = identifier - elif len(t[0]) == 6: # YYYYMM - identifiers['%s-%s' % (t[0][:4], t[0][4:6])] = identifier - elif len(t[0]) == 8: # YYYYMMDD - identifiers['%s-%s-%s' % (t[0][:4], t[0][4:6], t[0][6:8])] = identifier - else: - print('ERROR, dont understand date format in %s' % (identifier)) - elif len(t) == 2: - if len(t[0]) == 4 and len(t[1]) == 2: #YYYY-MM - identifiers['%s-%s' % (t[0], t[1])] = identifier - else: - print('ERROR, dont understand date 
format in %s' % (identifier)) - elif len(t) == 3: - if len(t[0]) == 4 and len(t[1]) == 2 and len(t[2]) == 2: #YYYY-MM-DD - identifiers['%s-%s-%s' % (t[0], t[1], t[2])] = identifier - else: - print('ERROR, dont understand date format in %s' % (identifier)) - - identifiers_list = [[k, v] for k, v in identifiers.items()] - identifiers_list.sort() - - rows = ["|-\n| %s || [https://archive.org/details/%s %s] || ??? || ???" % (k, v, v) for k, v in identifiers_list] - output = """ -{| class="wikitable sortable" -! Date !! Identifier !! Files !! Size (GB) -%s -|}""" % ('\n'.join(rows)) - print(output) - -if __name__ == '__main__': - main() diff --git a/wikimediacommons/commonschecker.py b/wikimediacommons/commonschecker.py deleted file mode 100644 index fa5bd768..00000000 --- a/wikimediacommons/commonschecker.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf8 -*- -# Copyright (C) 2011-2012 WikiTeam -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
- -import csv -import datetime -try: - from hashlib import md5 -except ImportError: # Python 2.4 compatibility - from md5 import new as md5 -import os -import re -import sys -import zipfile - -def welcome(): - """ """ - print "#"*73 - print """# Welcome to CommonsChecker 0.1 by WikiTeam (GPL v3) # -# More info at: http://code.google.com/p/wikiteam/ #""" - print "#"*73 - print '' - print "#"*73 - print """# Copyright (C) 2011-2012 WikiTeam # -# This program is free software: you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation, either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program. If not, see . 
#""" - print "#"*73 - print '' - -def main(): - welcome() - - startdate = '' - enddate = '' - delta = datetime.timedelta(days=1) #chunks by day - if len(sys.argv) == 1: - print 'Usage example: python script.py 2005-01-01 2005-01-10 [to check the first 10 days of 2005]' - sys.exit() - elif len(sys.argv) == 2: #use sys.argv[1] as start and enddata, just check a day - startdate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d') - enddate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d') - elif len(sys.argv) == 3: - startdate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d') - enddate = datetime.datetime.strptime(sys.argv[2], '%Y-%m-%d') - else: - sys.exit() - - print "Checking Wikimedia Commons files from %s to %s" % (startdate.strftime('%Y-%m-%d'), enddate.strftime('%Y-%m-%d')) - while startdate <= enddate: - print '== %s ==' % (startdate.strftime('%Y-%m-%d')) - filenamecsv = startdate.strftime('%Y-%m-%d.csv') - filenamezip = startdate.strftime('%Y-%m-%d.zip') - if os.path.exists(filenamecsv): - f = csv.reader(open(filenamecsv, 'r'), delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL) - if os.path.exists(filenamezip): - zipfiles = zipfile.ZipFile(filenamezip, 'r').infolist() - errors = [] - files_in_zip = [] - csv_data_dict = {} - csv_file_list = [] - files = {} - for img_name, img_saved_as, img_timestamp, img_user, img_user_text, img_size, img_width, img_height in f: - csv_data_dict[unicode('%s/%s' % (startdate.strftime('%Y/%m/%d'), img_saved_as), 'utf-8')] = {'img_name':img_name, 'img_saved_as':img_saved_as, 'img_timestamp':img_timestamp, 'img_user':img_user, 'img_user_text':img_user_text, 'img_size':img_size, 'img_width':img_width, 'img_height':img_height} - csv_file_list.append(unicode('%s/%s' % (startdate.strftime('%Y/%m/%d'), img_saved_as), 'utf-8')) - for i in zipfiles: - files_in_zip.append(i.filename) - files[i.filename] = i - combined = list(set(files_in_zip) & set(csv_file_list)) - for name in set(combined): - csv_img = csv_data_dict[name] 
- if csv_img['img_timestamp'].startswith(startdate.strftime('%Y%m%d')): - #check img_saved_as existence in zip and check size - #img_saved_as = unicode(img_saved_as, 'utf-8') - ok = False - error = 'missing' - i= files[name] - if str(i.file_size) == csv_img['img_size']: - ok = True - elif i.file_size == 0: - error = 'empty' - else: - error = 'corrupt (%s of %s bytes)' % (i.file_size, csv_img['img_size']) - if not ok: - print csv_img['img_name'], csv_img['img_saved_as'], error - errors.append([csv_img['img_saved_as'], error]) - if errors: - print 'This .zip contains errors:' - print '\n'.join([' -> "%s" is %s' % (filename, error) for filename, error in errors]) - else: - print 'No errors found' - else: - print 'Error, no %s available' % (filenamezip) - startdate += delta -if __name__ == "__main__": - main() diff --git a/wikimediacommons/commonsdownloader.py b/wikimediacommons/commonsdownloader.py deleted file mode 100644 index d1ee89d7..00000000 --- a/wikimediacommons/commonsdownloader.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf8 -*- - -# Copyright (C) 2011-2016 WikiTeam developers -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
- -import csv -import datetime -try: - from hashlib import md5 -except ImportError: # Python 2.4 compatibility - from md5 import new as md5 -import os -import re -import sys -import urllib - -def welcome(): - """ """ - print "#"*73 - print """# Welcome to CommonsDownloader 0.1 by WikiTeam (GPL v3) # -# More info: https://github.com/WikiTeam/wikiteam #""" - print "#"*73 - print '' - print "#"*73 - print """# Copyright (C) 2011-2016 WikiTeam # -# This program is free software: you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation, either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program. If not, see . #""" - print "#"*73 - print '' - -def bye(): - """ """ - print "---> Congratulations! Your dump is complete <---" - print "If you found any bug, report a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list" - print "If this is a public wiki, please, consider sending us a copy of this dump. Contact us at http://code.google.com/p/wikiteam" - print "Good luck! Bye!" - -def main(): - welcome() - - filenamefeed = 'commonssql.csv' # feed - startdate = '' - enddate = '' - delta = datetime.timedelta(days=1) #chunks by day - filenamelimit = 100 #do not change!!! 
- if len(sys.argv) == 1: - print 'Usage example: python script.py 2005-01-01 2005-01-10 [to download the first 10 days of 2005]' - sys.exit() - elif len(sys.argv) == 2: #use sys.argv[1] as start and enddata, just download a day - startdate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d') - enddate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d') - elif len(sys.argv) == 3: - startdate = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d') - enddate = datetime.datetime.strptime(sys.argv[2], '%Y-%m-%d') - else: - sys.exit() - - print "Downloading Wikimedia Commons files from %s to %s" % (startdate.strftime('%Y-%m-%d'), enddate.strftime('%Y-%m-%d')) - while startdate <= enddate: - print '== %s ==' % (startdate.strftime('%Y-%m-%d')) - savepath = startdate.strftime('%Y/%m/%d') - filenamecsv = startdate.strftime('%Y-%m-%d.csv') - filenamezip = startdate.strftime('%Y-%m-%d.zip') - c = 0 - f = csv.reader(open(filenamefeed, 'r'), delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL) - for img_name, img_timestamp, img_user, img_user_text, img_size, img_width, img_height in f: - if img_timestamp.startswith(startdate.strftime('%Y%m%d')): - if not c: #first loop - try: #create savepath if not exists - os.makedirs(savepath) - except: - pass - #csv header - h = open(filenamecsv, 'w') - h.write('img_name|img_saved_as|img_timestamp|img_user|img_user_text|img_size|img_width|img_height\n') - h.close() - - img_name = unicode(img_name, 'utf-8') - img_user_text = unicode(img_user_text, 'utf-8') - original_name = img_name - if re.search(ur"(?m)^\d{14}\!", original_name): #removing 20101005024534! 
(or similar) from name if present - original_name = original_name[15:] - # quote weird chars to avoid errors while wgetting - img_name_quoted = urllib.quote(re.sub(r' ', r'_', img_name.encode('utf-8'))) - # _ ending variables contains no spaces, and \" for command line - img_name_ = re.sub(r'"', r'\"', re.sub(r' ', r'_', img_name.encode('utf-8'))) # do not use ur'', it is encoded - original_name_ = re.sub(r'"', r'\"', re.sub(r' ', r'_', original_name.encode('utf-8'))) # do not use ur'', it is encoded - md5hash = md5(re.sub(' ', '_', original_name.encode("utf-8"))).hexdigest() # do not use image_name, md5 needs the original name and without \" - img_saved_as = '' - img_saved_as_ = '' - if len(img_name) > filenamelimit: #truncate filename if it is long - img_saved_as = img_name[:filenamelimit] + md5(re.sub(' ', '_', img_name.encode('utf-8'))).hexdigest() + '.' + img_name.split('.')[-1] - img_saved_as = re.sub(r' ', r'_', img_saved_as) # do not use ur'', it is encoded - img_saved_as_ = re.sub(r'"', r'\"', re.sub(r' ', r'_', img_saved_as.encode('utf-8'))) # do not use ur'', it is encoded - else: - img_saved_as = re.sub(r' ', r'_', img_name) # do not use ur'', it is encoded - img_saved_as_ = re.sub(r'"', r'\"', re.sub(r' ', r'_', img_name.encode('utf-8'))) # do not use ur'', it is encoded - print img_name, img_saved_as, img_timestamp - - #wget file - if original_name != img_name: #the image is an old version, download using /archive/ path in server - os.system('wget -c "https://upload.wikimedia.org/wikipedia/commons/archive/%s/%s/%s" -O "%s/%s"' % (md5hash[0], md5hash[0:2], img_name_quoted, savepath, img_saved_as_)) - try: - if not os.path.getsize('%s/%s' % (savepath, img_saved_as_)): #empty file?... - #probably false 20101005024534! begining like this http://commons.wikimedia.org/wiki/File:20041028210012!Pilar.jpg - #ok, restore original_name to ! 
version and recalculate md5 and other variables that use original_name as source - original_name = img_name - original_name_ = re.sub(r'"', r'\"', re.sub(r' ', r'_', original_name.encode('utf-8'))) - md5hash = md5(re.sub(' ', '_', original_name.encode("utf-8"))).hexdigest() - #redownload, now without /archive/ subpath - os.system('wget -c "https://upload.wikimedia.org/wikipedia/commons/%s/%s/%s" -O "%s/%s"' % (md5hash[0], md5hash[0:2], img_name_quoted, savepath, img_saved_as_)) - except OSError: - pass - else: - # Issue #66 : try your.org first - os.system('wget -c "http://ftpmirror.your.org/pub/wikimedia/images/wikipedia/commons/%s/%s/%s" -O "%s/%s"' % (md5hash[0], md5hash[0:2], img_name_quoted, savepath, img_saved_as_)) - os.system('wget -c "https://upload.wikimedia.org/wikipedia/commons/%s/%s/%s" -O "%s/%s"' % (md5hash[0], md5hash[0:2], img_name_quoted, savepath, img_saved_as_)) - - #curl .xml description page with full history - os.system('curl -d "&pages=File:%s&history=1&action=submit" https://commons.wikimedia.org/w/index.php?title=Special:Export -o "%s/%s.xml"' % (original_name_, savepath, img_saved_as_)) - - #save csv info - g = csv.writer(open(filenamecsv, 'a'), delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL) - g.writerow([img_name.encode('utf-8'), img_saved_as.encode('utf-8'), img_timestamp, img_user, img_user_text.encode('utf-8'), img_size, img_width, img_height]) - c += 1 - #zip downloaded files; add mT to the options if you want to save space by removing the downloaded files day by day; commonschecker needs only zip and csv - os.system('zip -9r %s %s/' % (filenamezip, savepath)) - startdate += delta - bye() - -if __name__ == "__main__": - main() diff --git a/wikipediadownloader.py b/wikipediadownloader.py deleted file mode 100644 index 15d23c88..00000000 --- a/wikipediadownloader.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -# Copyright (C) 2011-2014 WikiTeam -# This program is free software: you can 
redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -import argparse -import os -import re -import sys -import time -import urllib - - -def main(): - parser = argparse.ArgumentParser( - description='Downloader of Wikimedia dumps') - #parser.add_argument('-f', '--families', help='Choose which family projects to download (e.g. all, wikipedia, wikibooks, wikinews, wikiquote, wikisource, wikivoyage, wiktionary)', required=False) - parser.add_argument( - '-r', '--maxretries', help='Max retries to download a dump when md5sum doesn\'t fit. 
Default: 3', required=False) - parser.add_argument( - '-s', '--start', help='Start to download from this project (e.g.: eswiki, itwikisource, etc)', required=False) - args = parser.parse_args() - - maxretries = 3 - if args.maxretries and int(args.maxretries) >= 0: - maxretries = int(args.maxretries) - - dumpsdomain = 'http://dumps.wikimedia.org' - f = urllib.urlopen('%s/backup-index.html' % (dumpsdomain)) - raw = f.read() - f.close() - - m = re.compile( - r'[^<]+: Dump complete').finditer(raw) - projects = [] - for i in m: - projects.append([i.group('project'), i.group('date')]) - projects.reverse() # download oldest dumps first - #projects = [['enwiki', '20130805']] - - start = args.start - for project, date in projects: - if start: - if start != project: - print 'Skipping %s, %s' % (project, date) - continue - else: - start = '' # reset - - print '-' * 50, '\n', 'Checking', project, date, '\n', '-' * 50 - time.sleep(1) # ctrl-c - f = urllib.urlopen('%s/%s/%s/' % (dumpsdomain, project, date)) - htmlproj = f.read() - # print htmlproj - f.close() - - for dumpclass in ['pages-meta-history\d*\.xml[^\.]*\.7z']: - corrupted = True - maxretries2 = maxretries - while corrupted and maxretries2 > 0: - maxretries2 -= 1 - m = re.compile(r'' % - (project, date, project, date, dumpclass)).finditer(htmlproj) - urldumps = [] - # enwiki is splitted in several files, thats why we need a loop - # here - for i in m: - urldumps.append( - '%s/%s' % (dumpsdomain, i.group('urldump'))) - - # print urldumps - for urldump in urldumps: - dumpfilename = urldump.split('/')[-1] - path = '%s/%s' % (dumpfilename[0], project) - if not os.path.exists(path): - os.makedirs(path) - os.system('wget -c %s -O %s/%s' % - (urldump, path, dumpfilename)) - - # md5check - os.system('md5sum %s/%s > md5' % (path, dumpfilename)) - f = open('md5', 'r') - raw = f.read() - f.close() - md51 = re.findall( - r'(?P[a-f0-9]{32})\s+%s/%s' % (path, dumpfilename), raw)[0] - print md51 - - f = urllib.urlopen( - 
'%s/%s/%s/%s-%s-md5sums.txt' % (dumpsdomain, project, date, project, date)) - raw = f.read() - f.close() - f = open('%s/%s-%s-md5sums.txt' % - (path, project, date), 'w') - f.write(raw) - f.close() - md52 = re.findall( - r'(?P[a-f0-9]{32})\s+%s' % (dumpfilename), raw)[0] - print md52 - - if md51 == md52: - print 'md5sum is correct for this file, horay! \o/' - print '\n' * 3 - corrupted = False - else: - os.remove('%s/%s' % (path, dumpfilename)) - -if __name__ == '__main__': - main() diff --git a/wikispaces.py b/wikispaces.py deleted file mode 100644 index e134fc1f..00000000 --- a/wikispaces.py +++ /dev/null @@ -1,458 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- - -# Copyright (C) 2018 WikiTeam developers -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
- -# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki -# Documentation for developers: http://wikiteam.readthedocs.com - -import csv -import datetime -import os -import random -import re -import subprocess -import sys -import time -import urllib.request -#from internetarchive import get_item - -# Requirements: -# zip command (apt-get install zip) -# ia command (pip install internetarchive, and configured properly) - -""" -# You need a file with access and secret keys, in two different lines -iakeysfilename = '%s/.iakeys' % (os.path.expanduser('~')) -if os.path.exists(iakeysfilename): - accesskey = open(iakeysfilename, 'r').readlines()[0].strip() - secretkey = open(iakeysfilename, 'r').readlines()[1].strip() -else: - print('Error, no %s file with S3 keys for Internet Archive account' % (iakeysfilename)) - sys.exit() -""" - -def saveURL(wikidomain='', url='', filename='', path='', overwrite=False, iteration=1): - filename2 = '%s/%s' % (wikidomain, filename) - if path: - filename2 = '%s/%s/%s' % (wikidomain, path, filename) - if os.path.exists(filename2): - if not overwrite: - print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite') - return - opener = urllib.request.build_opener() - opener.addheaders = [('User-agent', 'Mozilla/5.0')] - urllib.request.install_opener(opener) - try: - urllib.request.urlretrieve(url, filename2) - except: - sleep = 10 # seconds - maxsleep = 30 - while sleep <= maxsleep: - try: - print('Error while retrieving: %s' % (url)) - print('Retry in %s seconds...' % (sleep)) - time.sleep(sleep) - urllib.request.urlretrieve(url, filename2) - return - except: - sleep = sleep * 2 - print('Download failed') - - #sometimes wikispaces returns invalid data, redownload in that cases - #only 'pages'. 
'files' binaries are a pain to open and check - if (os.path.exists(filename2) and 'pages' in path) or \ - (os.path.exists(filename2) and path == '' and filename2.split('.')[-1] in ['xml', 'html', 'csv']): - sleep2 = 60 * iteration - raw = '' - try: - with open(filename2, 'r', encoding='utf-8') as f: - raw = f.read() - except: - with open(filename2, 'r', encoding='latin-1') as f: - raw = f.read() - if re.findall(r'(?im)TES and THE Status', raw): - print('Warning: invalid content. Waiting %d seconds and re-downloading' % (sleep2)) - time.sleep(sleep2) - saveURL(wikidomain=wikidomain, url=url, filename=filename, path=path, overwrite=overwrite, iteration=iteration+1) - -def undoHTMLEntities(text=''): - """ Undo some HTML codes """ - - # i guess only < > & " ' need conversion - # http://www.w3schools.com/html/html_entities.asp - text = re.sub('<', '<', text) - text = re.sub('>', '>', text) - text = re.sub('&', '&', text) - text = re.sub('"', '"', text) - text = re.sub(''', '\'', text) - - return text - -def convertHTML2Wikitext(wikidomain='', filename='', path=''): - wikitext = '' - wikitextfile = '%s/%s/%s' % (wikidomain, path, filename) - if not os.path.exists(wikitextfile): - print('Error retrieving wikitext, page is a redirect probably') - return - with open(wikitextfile, 'r') as f: - wikitext = f.read() - with open(wikitextfile, 'w') as f: - m = re.findall(r'(?im)
      \s*
      ', wikitext)
      -        if m:
      -            try:
      -                wikitext = wikitext.split(m[0])[1].split('
      ')[0].strip() - wikitext = undoHTMLEntities(text=wikitext) - except: - pass - f.write(wikitext) - -def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False): - pagenameplus = re.sub(' ', '+', pagename) - pagename_ = urllib.parse.quote(pagename) - - #page current revision (html & wikitext) - pageurl = '%s/%s' % (wikiurl, pagename_) - filename = '%s.html' % (pagenameplus) - print('Downloading page: %s' % (filename)) - saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite) - pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_) - filename2 = '%s.wikitext' % (pagenameplus) - print('Downloading page: %s' % (filename2)) - saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite) - convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages') - - #csv with page history - csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_) - csvfilename = '%s.history.csv' % (pagenameplus) - print('Downloading page: %s' % (csvfilename)) - saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite) - -def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False): - filenameplus = re.sub(' ', '+', filename) - filename_ = urllib.parse.quote(filename) - - #file full resolution - fileurl = '%s/file/view/%s' % (wikiurl, filename_) - filename = filenameplus - print('Downloading file: %s' % (filename)) - saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite) - - #csv with file history - csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_) - csvfilename = '%s.history.csv' % (filenameplus) - print('Downloading file: %s' % (csvfilename)) - saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite) - -def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False): - 
print('Downloading Pages and Files from %s' % (wikiurl)) - #csv all pages and files - csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl) - saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='') - #download every page and file - totallines = 0 - with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f: - totallines = len(f.read().splitlines()) - 1 - with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile: - filesc = 0 - pagesc = 0 - print('This wiki has %d pages and files' % (totallines)) - rows = csv.reader(csvfile, delimiter=',', quotechar='"') - for row in rows: - if row[0] == 'file': - filesc += 1 - filename = row[1] - downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite) - elif row[0] == 'page': - pagesc += 1 - pagename = row[1] - downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite) - if (filesc + pagesc) % 10 == 0: - print(' Progress: %d of %d' % ((filesc + pagesc), totallines)) - print(' Progress: %d of %d' % ((filesc + pagesc), totallines)) - print('Downloaded %d pages' % (pagesc)) - print('Downloaded %d files' % (filesc)) - -def downloadSitemap(wikidomain='', wikiurl='', overwrite=False): - print('Downloading sitemap.xml') - saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite) - -def downloadMainPage(wikidomain='', wikiurl='', overwrite=False): - print('Downloading index.html') - saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite) - -def downloadLogo(wikidomain='', wikiurl='', overwrite=False): - index = '%s/index.html' % (wikidomain) - if os.path.exists(index): - raw = '' - try: - with open(index, 'r', encoding='utf-8') as f: - raw = f.read() - except: - with open(index, 'r', encoding='latin-1') as f: - raw = f.read() - m = re.findall(r'class="WikiLogo WikiElement"> 2: - if '--upload' in sys.argv: - upload = True - if 
'--admin' in sys.argv: - isadmin = True - if '--overwrite' in sys.argv: - overwrite = True - if '--overwrite-ia' in sys.argv: - overwriteia = True - if '--help' in sys.argv: - printhelp() - - wikilist = [] - if '://' in param: - wikilist.append(param.rstrip('/')) - elif param.lower() == 'duckduckgo': - wikilist = duckduckgo() - #for wiki in wikilist: - # print(wiki) - else: - with open(param, 'r') as f: - wikilist = f.read().strip().splitlines() - wikilist2 = [] - for wiki in wikilist: - wikilist2.append(wiki.rstrip('/')) - wikilist = wikilist2 - - for wikiurl in wikilist: - wikidomain = wikiurl.split('://')[1].split('/')[0] - print('\n') - print('#'*40,'\n Downloading:', wikiurl) - print('#'*40,'\n') - - if upload and not overwriteia: - itemid = 'wiki-%s' % (wikidomain) - try: - iahtml = '' - try: - iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8') - except: - time.sleep(10) - iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8') - if iahtml and not re.findall(r'(?im)Item cannot be found', iahtml): - if not overwriteia: - print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia') - print('You can find it in https://archive.org/details/%s' % (itemid)) - time.sleep(1) - continue - except: - pass - - dirfiles = '%s/files' % (wikidomain) - if not os.path.exists(dirfiles): - print('Creating directory %s' % (dirfiles)) - os.makedirs(dirfiles) - dirpages = '%s/pages' % (wikidomain) - if not os.path.exists(dirpages): - print('Creating directory %s' % (dirpages)) - os.makedirs(dirpages) - sitemapurl = 'https://%s/sitemap.xml' % (wikidomain) - - downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite) - if not os.path.exists('%s/sitemap.xml' % (wikidomain)): - print('Error, wiki was probably deleted. 
Skiping wiki...') - continue - else: - sitemapraw = '' - try: - with open('%s/sitemap.xml' % (wikidomain), encoding='utf-8') as g: - sitemapraw = g.read() - except: - with open('%s/sitemap.xml' % (wikidomain), encoding='latin-1') as g: - sitemapraw = g.read() - if re.search(r'(?im)

      This wiki has been deactivated

      ', sitemapraw): - print('Error, wiki was deactivated. Skiping wiki...') - continue - - downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite) - if not os.path.exists('%s/index.html' % (wikidomain)): - print('Error, wiki was probably deleted or expired. Skiping wiki...') - continue - else: - indexraw = '' - try: - with open('%s/index.html' % (wikidomain), encoding='utf-8') as g: - indexraw = g.read() - except: - with open('%s/index.html' % (wikidomain), encoding='latin-1') as g: - indexraw = g.read() - if re.search(r'(?im)

      Subscription Expired

      ', indexraw): - print('Error, wiki subscription expired. Skiping wiki...') - continue - - downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite) - logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite) - - if upload: - itemid = 'wiki-%s' % (wikidomain) - print('\nCompressing dump...') - wikidir = wikidomain - os.chdir(wikidir) - print('Changed directory to', os.getcwd()) - wikizip = '%s.zip' % (wikidomain) - subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True) - os.chdir('..') - print('Changed directory to', os.getcwd()) - - print('\nUploading to Internet Archive...') - indexfilename = '%s/index.html' % (wikidir) - if not os.path.exists(indexfilename): - print('\nError dump incomplete, skipping upload\n') - continue - indexhtml = '' - try: - with open(indexfilename, 'r', encoding='utf-8') as f: - indexhtml = f.read() - except: - with open(indexfilename, 'r', encoding='latin-1') as f: - indexhtml = f.read() - - wikititle = '' - try: - wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip() - except: - wikititle = wikidomain - if not wikititle: - wikititle = wikidomain - wikititle = wikititle.replace("\\'", " ") - wikititle = wikititle.replace('\\"', " ") - itemtitle = 'Wiki - %s' % wikititle - itemdesc = '
      %s dumped with WikiTeam tools.' % (wikiurl, wikititle) - itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain] - itemoriginalurl = wikiurl - itemlicenseurl = '' - m = '' - try: - m = re.findall(r'', indexhtml.split('
      . - -# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki -# Documentation for developers: http://wikiteam.readthedocs.com - -import json -import re -import sys -import urllib - -import wikiteam - -def mwCleanHTML(raw=''): - """ Extract only the real wiki content and remove rubbish """ - """ This function is ONLY used to retrieve page titles and file names when no API is available """ - """ DO NOT use this function to extract page content """ - - # different "tags" used by different MediaWiki versions to mark where - # starts and ends content - if re.search('', raw): - raw = raw.split('')[1].split('')[0] - elif re.search('', raw): - raw = raw.split( - '')[1].split('')[0] - elif re.search('', raw): - raw = raw.split( - '')[1].split('')[0] - elif re.search('', raw): - raw = raw.split('')[1].split('')[0] - elif re.search('
      ', raw): - raw = raw.split('
      ')[1].split('
      ')[0] - elif re.search('')[0] - else: - sys.stderr.write(raw[:250]) - sys.stderr.write('This wiki doesn\'t use marks to split content\n') - sys.exit() - return raw - -def mwCleanXML(xml=''): - """ Trim redundant info """ - - # do not touch XML codification, leave AS IS - if re.search(r'\n', xml): - xml = xml.split('\n')[1] - if re.search(r'', xml): - xml = xml.split('')[0] - return xml - -def mwCreateNewDump(config={}): - sys.stderr.write('Trying generating a new dump into a new directory...') - if config['pages']: - pagetitles = mwGetPageTitles(config=config) - wikiteam.savePageTitles(config=config, pagetitles=pagetitles) - mwGeneratePageDump(config=config, pagetitles=pagetitles) - mwCheckXMLIntegrity(config=config, pagetitles=pagetitles) - if config['images']: - imagenames = mwGetImageNames(config=config) - mwSaveImageNames(config=config, imagenames=imagenames) - mwGenerateImageDump(config=config, imagenames=imagenames) - if config['logs']: - mwSaveLogs(config=config) - mwSaveIndexPHP(config=config) - mwSaveSpecialVersion(config=config) - mwSaveSiteInfo(config=config) - -def mwCurateImageURL(config={}, url=''): - """ Returns an absolute URL for an image, adding the domain if missing """ - - if 'mwindex' in config and config['mwindex']: - # remove from :// (http or https) until the first / after domain - domainalone = config['mwindex'].split( - '://')[0] + '://' + config['mwindex'].split('://')[1].split('/')[0] - elif 'mwapi' in config and config['mwapi']: - domainalone = config['mwapi'].split( - '://')[0] + '://' + config['mwapi'].split('://')[1].split('/')[0] - else: - sys.stderr.write('ERROR: no index nor API') - sys.exit() - - if url.startswith('//'): # Orain wikifarm returns URLs starting with // - url = '%s:%s' % (domainalone.split('://')[0], url) - # is it a relative URL? 
- elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): - if url[0] == '/': # slash is added later - url = url[1:] - # concat http(s) + domain + relative url - url = '%s/%s' % (domainalone, url) - url = wikiteam.undoHTMLEntities(text=url) - # url = urllib.unquote(url) #do not use unquote with url, it break some - # urls with odd chars - url = re.sub(' ', '_', url) - - return url - -def mwGeneratePageDump(config={}, pagetitles=None, start=None): - """ Generates a XML dump for page titles """ - - sys.stderr.write('Retrieving XML for every page from "%s"' % (start or 'start')) - header = mwGetXMLHeader(config=config) - footer = '\n' # new line at the end - xmlfilename = '%s-%s-%s.xml' % (wikiteam.domain2prefix(config=config), - config['date'], - config['curonly'] and 'current' or 'history') - xmlfile = '' - lock = True - if start: - sys.stderr.write("Removing the last chunk of past XML dump: it is probably incomplete.\n") - for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True): - pass - else: - # requested complete xml dump - lock = False - xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w') - xmlfile.write(header) - xmlfile.close() - - xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a') - c = 1 - for pagetitle in mwGetPageTitles(config=config, start=start): - if not pagetitle.strip(): - continue - if pagetitle == start: # start downloading from start, included - lock = False - if lock: - continue - wikiteam.delay(config=config) - if c % 10 == 0: - sys.stderr.write('Downloaded %d pages\n' % (c)) - try: - for xml in getXMLPage(config=config, title=title): - xml = cleanXML(xml=xml) - xmlfile.write(xml) - except PageMissingError: - logerror( - config=config, - text='The page "%s" was missing in the wiki (probably deleted)' % - (title)) - # here, XML is a correct chunk or - # an empty string due to a deleted page (logged in errors log) or - # an empty string due to an error while retrieving 
the page from server - # (logged in errors log) - c += 1 - xmlfile.write(footer) - xmlfile.close() - sys.stderr.write('XML dump saved at... %s\n' % (xmlfilename)) - -def mwGetAPI(config={}): - """ Returns API for a MediaWiki wiki, if available """ - - api = '' - html = wikiteam.getURL(url=config['wiki']) - m = re.findall( - r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>', - html) - if m: - api = m[0] - if api.startswith('//'): # gentoo wiki and others - api = url.split('//')[0] + api - return api - -def mwGetImageNames(config={}): - """ Get list of image names """ - - sys.stderr.write('Retrieving image filenames\n') - imagenames = [] - if 'mwapi' in config and config['mwapi']: - imagenames = mwGetImageNamesAPI(config=config) - elif 'mwindex' in config and config['mwindex']: - imagenames = mwGetImageNamesScraper(config=config) - # imagenames = list(set(imagenames)) # it is a list of lists - imagenames.sort() - sys.stderr.write('%d image names loaded\n' % (len(imagenames))) - return imagenames - -def mwGetImageNamesAPI(config={}): - """ Retrieve file list: filename, url, uploader """ - - oldAPI = False - aifrom = '!' 
- imagenames = [] - while aifrom: - sys.stderr.write('.') # progress - data = { - 'action': 'query', - 'list': 'allimages', - 'aiprop': 'url|user', - 'aifrom': aifrom, - 'format': 'json', - 'ailimit': 500} - # FIXME Handle HTTP Errors HERE - r = wikiteam.getURL(url=config['mwapi'], data=data) - #handleStatusCode(r) - jsonimages = wikiteam.getJSON(r) - wikiteam.delay(config=config) - - if 'query' in jsonimages: - aifrom = '' - if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']: - if 'aicontinue' in jsonimages['query-continue']['allimages']: - aifrom = jsonimages['query-continue']['allimages']['aicontinue'] - elif 'aifrom' in jsonimages['query-continue']['allimages']: - aifrom = jsonimages['query-continue']['allimages']['aifrom'] - elif 'continue' in jsonimages: - if 'aicontinue' in jsonimages['continue']: - aifrom = jsonimages['continue']['aicontinue'] - elif 'aifrom' in jsonimages['continue']: - aifrom = jsonimages['continue']['aifrom'] - # sys.stderr.write(aifrom) - - for image in jsonimages['query']['allimages']: - url = image['url'] - url = mwCurateImageURL(config=config, url=url) - # encoding to ascii is needed to work around this horrible bug: - # http://bugs.python.org/issue8136 - if 'mwapi' in config and '.wikia.com' in config['mwapi']: - #to avoid latest?cb=20120816112532 in filenames - filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore') - else: - filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore') - uploader = re.sub('_', ' ', image['user']) - imagenames.append([filename, url, uploader]) - else: - oldAPI = True - break - - if oldAPI: - gapfrom = '!' 
- imagenames = [] - while gapfrom: - sys.stderr.write('.') # progress - # Some old APIs doesn't have allimages query - # In this case use allpages (in nm=6) as generator for imageinfo - # Example: - # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6 - # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=! - data = { - 'action': 'query', - 'generator': 'allpages', - 'gapnamespace': 6, - 'gaplimit': 500, - 'gapfrom': gapfrom, - 'prop': 'imageinfo', - 'iiprop': 'user|url', - 'format': 'json'} - # FIXME Handle HTTP Errors HERE - r = wikiteam.getURL(url=config['mwapi'], data=data) - #handleStatusCode(r) - jsonimages = wikiteam.getJSON(r) - wikiteam.delay(config=config) - - if 'query' in jsonimages: - gapfrom = '' - if 'query-continue' in jsonimages and 'allpages' in jsonimages['query-continue']: - if 'gapfrom' in jsonimages['query-continue']['allpages']: - gapfrom = jsonimages['query-continue']['allpages']['gapfrom'] - - for image, props in jsonimages['query']['pages'].items(): - url = props['imageinfo'][0]['url'] - url = mwCurateImageURL(config=config, url=url) - tmp_filename = ':'.join(props['title'].split(':')[1:]) - filename = re.sub('_', ' ', tmp_filename) - uploader = re.sub('_', ' ', props['imageinfo'][0]['user']) - imagenames.append([filename, url, uploader]) - else: - # if the API doesn't return query data, then we're done - break - - if len(imagenames) == 1: - sys.stderr.write(' Found 1 image') - else: - sys.stderr.write(' Found %d images' % (len(imagenames))) - - return imagenames - -def mwGetImageNamesScraper(config={}): - """ Retrieve file list: filename, url, uploader """ - - # (?\d+)&' - imagenames = [] - offset = '29990101000000' # january 1, 2999 - limit = 5000 - retries = config['retries'] - while offset: - # 5000 overload some servers, but it is needed for sites like this with - # no next links - # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - data={ - 'title': 
'Special:Imagelist', - 'limit': limit, - 'offset': offset} - raw = wikiteam.getURL(url=config['index'], data=data) - #handleStatusCode(r) - wikiteam.delay(config=config) - # delicate wiki - if re.search(r'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): - if limit > 10: - sys.stderr.write('Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)) - limit = limit / 10 - continue - elif retries > 0: # waste retries, then exit - retries -= 1 - sys.stderr.write('Retrying...') - continue - else: - sys.stderr.write('No more retries, exit...') - break - - raw = mwCleanHTML(raw) - # archiveteam 1.15.1
      Yahoovideo.jpg (file) - # wikanda 1.15.5 Fernandocg - r_images1 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)' - # wikijuegos 1.9.5 - # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old - # mediawiki version - r_images2 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+\s*[^<]+\s*[^<]+\s*]+>(?P[^<]+)' - # gentoowiki 1.18 - r_images3 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>]+>[^<]+]+>(?P[^<]+)' - # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
      - r_images4 = r'(?im)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)' - r_images5 = ( - r'(?im)\s*]*?>(?P[^>]+)\s*\([^<]*?\s*\)\s*\s*' - '[^\n\r]*?\s*' - '[^<]*?\s*' - '\s*()?(?P[^<]+?)()?\s*') - - # Select the regexp that returns more results - regexps = [r_images1, r_images2, r_images3, r_images4, r_images5] - count = 0 - i = 0 - regexp_best = 0 - for regexp in regexps: - if len(re.findall(regexp, raw)) > count: - count = len(re.findall(regexp, raw)) - regexp_best = i - i += 1 - m = re.compile(regexps[regexp_best]).finditer(raw) - - # Iter the image results - for i in m: - url = i.group('url') - url = mwCurateImageURL(config=config, url=url) - filename = re.sub('_', ' ', i.group('filename')) - filename = wikiteam.undoHTMLEntities(text=filename) - filename = urllib.unquote(filename) - uploader = re.sub('_', ' ', i.group('uploader')) - uploader = wikiteam.undoHTMLEntities(text=uploader) - uploader = urllib.unquote(uploader) - imagenames.append([filename, url, uploader]) - - if re.search(r_next, raw): - new_offset = re.findall(r_next, raw)[0] - # Avoid infinite loop - if new_offset != offset: - offset = new_offset - retries += 5 # add more retries if we got a page with offset - else: - offset = '' - else: - offset = '' - - if (len(imagenames) == 1): - sys.stderr.write(' Found 1 image') - else: - sys.stderr.write(' Found %d images' % (len(imagenames))) - - imagenames.sort() - return imagenames - -def mwGetIndex(config={}): - """ Returns Index.php for a MediaWiki wiki, if available """ - - if config['mwapi']: - mwapi = config['mwapi'] - else: - mwapi = mwGetAPI(config=config) - index = '' - html = wikiteam.getURL(url=config['wiki']) - m = re.findall(r'
    • ]*?>\s*(?:)?\s*]*?>\s*(?:)?\s*]*? to include selected="selected" - m = re.compile(r'').finditer(raw) - if 'all' in namespaces: - namespaces = [] - for i in m: - namespaces.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") - else: - # check if those namespaces really exist in this wiki - namespaces2 = [] - for i in m: - if int(i.group("namespaceid")) in namespaces: - namespaces2.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") - namespaces = namespaces2 - else: - namespaces = [0] - - namespaces = list(set(namespaces)) # uniques - std.stderr.write('%d namespaces found' % (len(namespaces))) - return namespaces, namespacenames - -def mwGetPageTitles(config={}): - """ Get list of page titles """ - # http://en.wikipedia.org/wiki/Special:AllPages - # http://archiveteam.org/index.php?title=Special:AllPages - # http://www.wikanda.es/wiki/Especial:Todas - sys.stderr.write('Loading page titles from namespaces = %s\n' % (','.join([str(i) for i in config['namespaces']]) or 'None')) - sys.stderr.write('Excluding titles from namespaces = %s\n' % (','.join([str(i) for i in config['exnamespaces']]) or 'None')) - - if 'mwapi' in config and config['mwapi']: - for pagetitle in mwGetPageTitlesAPI(config=config): - yield pagetitle - elif 'mwindex' in config and config['mwindex']: - for pagetitle in mwGetPageTitlesScraper(config=config): - yield pagetitle - -def mwGetPageTitlesAPI(config={}): - """ Uses the API to get the list of page titles """ - pagetitles = [] - namespaces, namespacenames = mwGetNamespacesAPI( - config=config) - for namespace in namespaces: - if namespace in config['exnamespaces']: - sys.stderr.write(' Skipping namespace = %d\n' % (namespace)) - continue - - c = 0 - sys.stderr.write(' Retrieving page titles in namespace %d\n' % (namespace)) - apfrom = '!' 
- while apfrom: - sys.stderr.write('.') # progress - data = { - 'action': 'query', - 'list': 'allpages', - 'apnamespace': namespace, - 'apfrom': apfrom.encode('utf-8'), - 'format': 'json', - 'aplimit': 500} - retryCount = 0 - while retryCount < config["retries"]: - try: - r = wikiteam.getURL(url=config['mwapi'], data=data) - break - except ConnectionError as err: - sys.stderr.write("Connection error: %s\n" % (str(err),)) - retryCount += 1 - time.sleep(20) - #wikiteam.handleStatusCode(r) - # FIXME Handle HTTP errors here! - jsontitles = wikiteam.getJSON(r) - apfrom = '' - if 'query-continue' in jsontitles and 'allpages' in jsontitles[ - 'query-continue']: - if 'apcontinue' in jsontitles['query-continue']['allpages']: - apfrom = jsontitles[ - 'query-continue']['allpages']['apcontinue'] - elif 'apfrom' in jsontitles['query-continue']['allpages']: - apfrom = jsontitles['query-continue']['allpages']['apfrom'] - elif 'continue' in jsontitles: - if 'apcontinue' in jsontitles['continue']: - apfrom = jsontitles['continue']['apcontinue'] - elif 'apfrom' in jsontitles['continue']: - apfrom = jsontitles['continue']['apfrom'] - - # sys.stderr.write(apfrom) - # sys.stderr.write(jsontitles) - allpages = jsontitles['query']['allpages'] - # Hack for old versions of MediaWiki API where result is dict - if isinstance(allpages, dict): - allpages = allpages.values() - for page in allpages: - yield page['title'] - c += len(allpages) - - if len(pagetitles) != len(set(pagetitles)): - # Are we in a loop? 
Server returning dupes, stop it - sys.stderr.write('Probably a loop, finishing\n') - apfrom = '' - - wikiteam.delay(config=config) - sys.stderr.write(' %d titles retrieved in namespace %d\n' % (c, namespace)) - - -def mwGetPageTitlesScraper(config={}): - """ Scrape list of page titles from Special:Allpages """ - - pagetitles = [] - namespaces, namespacenames = mwGetNamespacesScraper( - config=config) - for namespace in namespaces: - sys.stderr.write(' Retrieving titles in namespace %s\n' % (namespace)) - url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace) - raw = wikiteam.getURL(url=url) - raw = mwCleanHTML(raw) - - r_title = r'title="(?P[^>]+)">' - r_suballpages = '' - r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">' - r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">' - r_suballpages3 = r'&from=(?P<from>[^>]+)" title="[^>]+">' - if re.search(r_suballpages1, raw): - r_suballpages = r_suballpages1 - elif re.search(r_suballpages2, raw): - r_suballpages = r_suballpages2 - elif re.search(r_suballpages3, raw): - r_suballpages = r_suballpages3 - else: - pass # perhaps no subpages - - # 3 is the current deep of English Wikipedia for Special:Allpages - deep = 3 - c = 0 - checked_suballpages = [] - rawacum = raw - while r_suballpages and re.search(r_suballpages, raw) and c < deep: - # load sub-Allpages - m = re.compile(r_suballpages).finditer(raw) - for i in m: - fr = i.group('from') - - if r_suballpages == r_suballpages1: - to = i.group('to') - name = '%s-%s' % (fr, to) - url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % ( - config['index'], namespace, fr, to) # do not put urllib.quote in fr or to - # fix, esta regexp no carga bien todas? o falla el r_title en - # este tipo de subpag? 
(wikiindex) - elif r_suballpages == r_suballpages2: - # clean &namespace=\d, sometimes happens - fr = fr.split('&namespace=')[0] - name = fr - url = '%s?title=Special:Allpages/%s&namespace=%s' % ( - config['index'], name, namespace) - elif r_suballpages == r_suballpages3: - fr = fr.split('&namespace=')[0] - name = fr - url = '%s?title=Special:Allpages&from=%s&namespace=%s' % ( - config['index'], name, namespace) - - if name not in checked_suballpages: - # to avoid reload dupe subpages links - checked_suballpages.append(name) - wikiteam.delay(config=config) - raw2 = wikiteam.getURL(url=url) - raw2 = mwCleanHTML(raw2) - rawacum += raw2 # merge it after removed junk - sys.stderr.write(' Reading %s, %s bytes, %d subpages, %d pages' % (name, len(raw2), \ - len(re.findall(r_suballpages, raw2)), \ - len(re.findall(r_title, raw2)))) - - wikiteam.delay(config=config) - c += 1 - - c = 0 - m = re.compile(r_title).finditer(rawacum) - for i in m: - t = wikiteam.undoHTMLEntities(text=i.group('title')) - if not t.startswith('Special:'): - if t not in pagetitles: - pagetitles.append(t) - c += 1 - sys.stderr.write(' %d titles retrieved in the namespace %d\n' % (c, namespace)) - return pagetitles - -def mwGetXMLHeader(config={}): - """ Retrieve a random page to extract XML header (namespace info, etc) """ - - pagetitle = 'Main_Page' - try: - xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)]) - except PageMissingError as pme: - # The <page> does not exist. Not a problem, if we get the <siteinfo>. - xml = pme.xml - except ExportAbortedError: - # Issue 26: Account for missing "Special" namespace. - # Hope the canonical special name has not been removed. 
- # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases - try: - if config['mwapi']: - sys.stderr.write("Trying the local name for the Special namespace instead\n") - xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)]) - except PageMissingError as pme: - xml = pme.xml - except ExportAbortedError: - pass - - header = xml.split('</mediawiki>')[0] - if not re.match(r"\s*<mediawiki", xml): - sys.stderr.write('XML export on this wiki is broken, quitting.\n') - logerror('XML export on this wiki is broken, quitting.') - sys.exit() - return header - -def mwGetXMLPage(config={}, pagetitle='', verbose=True): - """ Get the full history (or current only) of a page """ - - # if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partialy truncated - # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F - - limit = 1000 - truncated = False - pagetitle_ = re.sub(' ', '_', pagetitle) - # do not convert & into %26, pagetitle_ = re.sub('&', '%26', pagetitle_) - data = {'title': config['mwexport'], 'pages': pagetitle_, 'action': 'submit'} - if config['curonly']: - data['curonly'] = 1 - data['limit'] = 1 - else: - data['offset'] = '1' # 1 always < 2000s - data['limit'] = limit - # in other case, do not set data['templates'] - if 'templates' in config and config['templates']: #fix, what is this option for? 
- data['templates'] = 1 - - xml = mwGetXMLPageCore(config=config, data=data) - if not xml: - raise ExportAbortedError(config['index']) - if not "</page>" in xml: - raise PageMissingError(data['title'], xml) - else: - # strip these sha1s sums which keep showing up in the export and - # which are invalid for the XML schema (they only apply to - # revisions) - xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml) - xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml) - - yield xml.split("</page>")[0] - - # if complete history, check if this page history has > limit edits, - # if so, retrieve all revisions using offset if available - # else, warning about Special:Export truncating large page histories - r_timestamp = r'<timestamp>([^<]+)</timestamp>' - numedits = 0 - numedits += len(re.findall(r_timestamp, xml)) - - # search for timestamps in xml to avoid analysing empty pages like - # Special:Allpages and the random one - if not config['curonly'] and re.search(r_timestamp, xml): - while not truncated and data['offset']: # next chunk - # get the last timestamp from the acum XML - # assuming history is sorted chronologically - data['offset'] = re.findall(r_timestamp, xml)[-1] - try: - xml2 = mwGetXMLPageCore(config=config, data=data) - except MemoryError: - sys.stderr.write("Page history exceeds our memory, halving limit.\n") - data['limit'] = data['limit'] / 2 - continue - - # are there more edits in this next XML chunk or no <page></page>? - if re.findall(r_timestamp, xml2): - if re.findall(r_timestamp, xml2)[-1] == data['offset']: - # again the same XML, this wiki does not support params in - # Special:Export, offer complete XML up to X edits (usually - # 1000) - sys.stderr.write('ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated\n') - truncated = True - break - else: - """ </namespaces> - </siteinfo> - <page> - <title>Main Page - 15580374 - edit=sysop:move=sysop (?) 
- - 418009832 - 2011-03-09T19:57:06Z - - """ - # offset is OK in this wiki, merge with the previous chunk - # of this page history and continue - try: - xml2 = xml2.split("")[0] - yield ' ' + (''.join(xml2.split('')[1:])) - except MemoryError: - sys.stderr.write("Page's history exceeds our memory, halving limit.\n") - data['limit'] = data['limit'] / 2 - continue - xml = xml2 - numedits += len(re.findall(r_timestamp, xml)) - else: - data['offset'] = '' # no more edits in this page history - yield "\n" - - if verbose: - if numedits == 1: - sys.stderr.write(' %s, 1 edit\n' % (pagetitle)) - else: - sys.stderr.write(' %s, %d edits\n' % (pagetitle, numedits)) - -def mwGetXMLPageCore(config={}, data={}): - """ Returns a XML containing data['limit'] revisions (or current only), ending in - if retrieving data['limit'] revisions fails, returns current only version - if all fail, returns empty string - """ - - xml = '' - cretries = 0 - maxseconds = 100 # max seconds to wait in a single sleeping - maxretries = config['retries'] # x retries and exit - increment = 20 # increment seconds every retry - - while not re.search(r'', xml): - if cretries > 0 and cretries < maxretries: - wait = increment * cretries < maxseconds and increment * \ - cretries or maxseconds # incremental until maxseconds - sys.stderr.write(' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...\n' % (c, data['pages'], wait)) - time.sleep(wait) - # reducing server load requesting smallest chunks (if curonly then - # limit = 1 from mother function) - if data['limit'] > 1: - data['limit'] = data['limit'] / 2 # half - if cretries >= maxretries: - sys.stderr.write(' We have retried %d times\n' % (cretries)) - sys.stderr.write(' MediaWiki error for "%s", probably network error...' % (data['pages'])) - # If it's not already what we tried: our last chance, preserve only the last revision... 
- # config['curonly'] means that the whole dump is configured to save only the last, - # data['curonly'] should mean that we've already tried this - # fallback, because it's set by the following if and passed to - # mwGetXMLPageCore - if not config['curonly'] and not 'curonly' in data: - sys.stderr.write(' Trying to save only the last revision for this page...\n') - data['curonly'] = 1 - logerror( - config=config, - text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % - (data['pages']) - ) - return mwGetXMLPageCore(config=config, data=data) - else: - sys.stderr.write(' Saving in error log, skipping...\n') - logerror( - config=config, - text='Error while retrieving last revision of "%s". Skipping.\n' % - (data['pages'])) - raise ExportAbortedError(config['index']) - return '' # empty xml - # FIXME HANDLE HTTP Errors HERE - try: - r = wikiteam.getURL(url=config['index'], data=data) - #handleStatusCode(r) - #r = fixBOM(r) - xml = fixBOM(r) - except: - sys.stderr.write(' Connection error\n') - xml = '' - cretries += 1 - - return xml - -def mwReadPageTitles(config={}, start=None): - """ Read title list from a file, from the title "start" """ - - titlesfilename = '%s-%s-titles.txt' % ( - domain2prefix(config=config), config['date']) - titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r') - - seeking = False - if start: - seeking = True - - with titlesfile as f: - for line in f: - if line.strip() == '--END--': - break - elif seeking and line.strip() != start: - continue - elif seeking and line.strip() == start: - seeking = False - yield line.strip() - else: - yield line.strip() - -def mwRemoveIP(raw=''): - """ Remove IP from HTML comments """ - - raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw) - # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html - # weird cases as :: are not included - raw = re.sub( - 
r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}', - '0:0:0:0:0:0:0:0', - raw) - - return raw - -def mwResumePreviousDump(config={}): - imagenames = [] - sys.stderr.write('Resuming previous dump process...') - if config['xml']: - pagetitles = mwReadPageTitles(config=config) - try: - lasttitles = wikiteam.reverseReadline('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config), config['date'])) - lasttitle=lasttitles.next() - if lasttitle == '': - lasttitle=lasttitles.next() - except: - pass # probably file does not exists - if lasttitle == '--END--': - # titles list is complete - sys.stderr.write('Title list was completed in the previous session') - else: - sys.stderr.write('Title list is incomplete. Reloading...') - # do not resume, reload, to avoid inconsistences, deleted pages or - # so - pagetitles = mwGetPageTitles(config=config, start=lastxmltitle) - wikiteam.savePageTitles(config=config, pagetitles=pagetitles) - - # checking xml dump - xmliscomplete = False - lastxmltitle = None - try: - f = wikiteam.reverseReadline( - '%s/%s-%s-%s.xml' % - (config['path'], - domain2prefix( - config=config), - config['date'], - config['curonly'] and 'current' or 'history'), - ) - for l in f: - if l == '': - # xml dump is complete - xmliscomplete = True - break - - xmltitle = re.search(r'([^<]+)', l) - if xmltitle: - lastxmltitle = wikiteam.undoHTMLEntities(text=xmltitle.group(1)) - break - except: - pass # probably file does not exists - - if xmliscomplete: - sys.stderr.write('XML dump was completed in the previous session') - elif lastxmltitle: - # resuming... - sys.stderr.write('Resuming XML dump from "%s"' % (lastxmltitle)) - pagetitles = mwReadPageTitles(config=config, start=lastxmltitle) - mwGenerateXMLDump( - config=config, - pagetitles=pagetitles, - start=lastxmltitle) - else: - # corrupt? only has XML header? - sys.stderr.write('XML is corrupt? 
Regenerating...') - pagetitles = mwReadPageTitles(config=config) - mwGenerateXMLDump(config=config, pagetitles=pagetitles) - - if config['images']: - # load images - lastimage = '' - try: - f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r') - raw = f.read().strip() - lines = raw.split('\n') - for l in lines: - if re.search(r'\t', l): - imagenames.append(l.split('\t')) - lastimage = lines[-1] - f.close() - except: - pass # probably file doesnot exists - if lastimage == '--END--': - sys.stderr.write('Image list was completed in the previous session') - else: - sys.stderr.write('Image list is incomplete. Reloading...') - # do not resume, reload, to avoid inconsistences, deleted images or - # so - imagenames = mwGetImageNames(config=config) - saveImageNames(config=config, imagenames=imagenames) - # checking images directory - listdir = [] - try: - listdir = [n.decode('utf-8') for n in os.listdir('%s/images' % (config['path']))] - except: - pass # probably directory does not exist - listdir.sort() - complete = True - lastfilename = '' - lastfilename2 = '' - c = 0 - for filename, url, uploader in imagenames: - lastfilename2 = lastfilename - # return always the complete filename, not the truncated - lastfilename = filename - filename2 = filename - if len(filename2) > other['filenamelimit']: - filename2 = truncateFilename(other=other, filename=filename2) - if filename2 not in listdir: - complete = False - break - c += 1 - sys.stderr.write('%d images were found in the directory from a previous session' % (c)) - if complete: - # image dump is complete - sys.stderr.write('Image dump was completed in the previous session') - else: - # we resume from previous image, which may be corrupted (or missing - # .desc) by the previous session ctrl-c or abort - mwGenerateImageDump(config=config, imagenames=imagenames, start=lastfilename2) - - if config['logs']: - # fix - pass - - mwSaveIndexPHP(config=config) - 
mwSaveSpecialVersion(config=config) - mwSaveSiteInfo(config=config) - -def mwSaveIndexPHP(config={}): - """ Save index.php as .html, to preserve license details available at the botom of the page """ - - if os.path.exists('%s/index.html' % (config['path'])): - sys.stderr.write('index.html exists, do not overwrite') - else: - sys.stderr.write('Downloading index.php (Main Page) as index.html') - raw = wikiteam.getURL(url=config['index'], data={}) - wikiteam.delay(config=config) - raw = mwRemoveIP(raw=raw) - with open('%s/index.html' % (config['path']), 'w') as outfile: - outfile.write(raw) - -def mwSaveSiteInfo(config={}): - """ Save a file with site info """ - - if config['api']: - if os.path.exists('%s/siteinfo.json' % (config['path'])): - sys.stderr.write('siteinfo.json exists, do not overwrite') - else: - sys.stderr.write('Downloading site info as siteinfo.json') - - # MediaWiki 1.13+ - raw = wikiteam.getURL(url=config['api'], data={ - 'action': 'query', - 'meta': 'siteinfo', - 'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo', - 'sinumberingroup': 1, - 'format': 'json'}) - wikiteam.delay(config=config) - # MediaWiki 1.11-1.12 - if not 'query' in wikiteam.getJSON(raw): - raw = wikiteam.getURL(url=config['api'], data={ - 'action': 'query', - 'meta': 'siteinfo', - 'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap', - 'format': 'json'}) - # MediaWiki 1.8-1.10 - if not 'query' in wikiteam.getJSON(raw): - raw = wikiteam.getURL(url=config['api'], data={ - 'action': 'query', - 'meta': 'siteinfo', - 'siprop': 'general|namespaces', - 'format': 'json'}) - result = wikiteam.getJSON(raw) - wikiteam.delay(config=config) - with open('%s/siteinfo.json' % (config['path']), 'w') as outfile: - outfile.write(json.dumps(result, indent=4, sort_keys=True)) - -def mwSaveSpecialVersion(config={}): - """ Save Special:Version as .html, to preserve extensions 
details """ - - if os.path.exists('%s/Special:Version.html' % (config['path'])): - sys.stderr.write('Special:Version.html exists, do not overwrite') - else: - sys.stderr.write('Downloading Special:Version with extensions and other related info') - raw = wikiteam.getURL(url=config['index'], data={'title': 'Special:Version'}) - wikiteam.delay(config=config) - raw = mwRemoveIP(raw=raw) - with open('%s/Special:Version.html' % (config['path']), 'w') as outfile: - outfile.write(raw) - -def main(): - pass - -if __name__ == "__main__": - main() diff --git a/wikiteam/wikispaces.py b/wikiteam/wikispaces.py deleted file mode 100644 index 976f5494..00000000 --- a/wikiteam/wikispaces.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Copyright (C) 2011-2016 WikiTeam developers -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
- -# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki -# Documentation for developers: http://wikiteam.readthedocs.com - -import re - -import wikiteam - -def wsCreateNewDump(config={}): - pass - -def wsGetPageTitles(config={}): - pass - -def main(): - pass - -if __name__ == "__main__": - main() diff --git a/wikiteam/wikiteam.py b/wikiteam/wikiteam.py deleted file mode 100644 index 560cd470..00000000 --- a/wikiteam/wikiteam.py +++ /dev/null @@ -1,792 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Copyright (C) 2011-2016 WikiTeam developers -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . 
- -# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki -# Documentation for developers: http://wikiteam.readthedocs.com - -import argparse -import datetime -import http.cookiejar as cookielib -import json -import os -import pickle as cPickle -import random -import re -import sys -import time -import urllib - -__version__ = "0.3.1" - -""" -Stuff to check if works properly or re-add if needed: -* fixBOM -* sessions -""" - -def avoidWikimediaProjects(config={}): - """ Skip Wikimedia projects and redirect to the dumps website """ - - # notice about wikipedia dumps - if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['wiki']): - sys.stderr.write('PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\n') - sys.stderr.write('Download Wikimedia dumps from https://dumps.wikimedia.org\n') - """if not other['force']: - sys.stderr.write('Thanks!') - sys.exit()""" - -def bye(): - """ Print closing message """ - - message = """ ----> Congratulations! Your dump is complete <--- -If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues -If this is a public wiki, please consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam -Good luck! Bye!""" - sys.stderr.write(message) - -def createNewDump(config={}): - if config['wikiengine'] == 'mediawiki': - import mediawiki - mediawiki.mwCreateNewDump(config=config) - elif config['wikiengine'] == 'wikispaces': - import wikispaces - wikispaces.wsCreateNewDump(config=config) - else: - sys.stderr.write("Wikiengine %s not supported. Exiting.\n" % (config['wikiengine'])) - -def createDumpPath(config={}): - # creating path or resuming if desired - c = 2 - # to avoid concat blabla-2, blabla-2-3, and so on... 
- originalpath = config['path'] - # do not enter if resume is requested from begining - while not config['other']['resume'] and os.path.isdir(config['path']): - sys.stderr.write('\nWarning!: "%s" path exists\n' % (config['path'])) - reply = '' - while reply.lower() not in ['yes', 'y', 'no', 'n']: - reply = input( - 'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' % - (config['path'], - config['path'], - config['other']['configfilename'])) - if reply.lower() in ['yes', 'y']: - if not os.path.isfile('%s/%s' % (config['path'], config['other']['configfilename'])): - sys.stderr.write('No config file found. I can\'t resume. Aborting.\n') - sys.exit() - sys.stderr.write('You have selected: YES\n') - config['other']['resume'] = True - break - elif reply.lower() in ['no', 'n']: - sys.stderr.write('You have selected: NO\n') - config['other']['resume'] = False - config['path'] = '%s-%d' % (originalpath, c) - sys.stderr.write('Trying to use path "%s"...\n' % (config['path'])) - c += 1 - return config - -def delay(config={}): - """ Add a delay if configured for that """ - if config['delay'] > 0: - sys.stderr.write('Sleeping... %d seconds...\n' % (config['delay'])) - time.sleep(config['delay']) - -def domain2prefix(config={}): - """ Convert domain name to a valid prefix filename. 
""" - - domain = '' - if config['wiki']: - domain = config['wiki'] - domain = domain.lower() - domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain) - domain = re.sub(r'/', '_', domain) - domain = re.sub(r'\.', '', domain) - domain = re.sub(r'[^A-Za-z0-9]', '_', domain) - domain = domain.strip('_') - return domain - -def getAPI(config={}): - """ Returns API for a wiki, if available """ - - api = '' - if config['wikiengine'] == 'mediawiki': - import mediawiki - api = mediawiki.mwGetAPI(config=config) - - return api - -def getImageNames(config={}): - """ Returns list of image names for this wiki """ - - imagenames = [] - if config['wikiengine'] == 'mediawiki': - import mediawiki - imagenames = mediawiki.mwGetImageNames(config=config) - - return imagenames - -def getIndex(config={}): - """ Returns Index.php for a wiki, if available """ - - index = '' - if config['wikiengine'] == 'mediawiki': - import mediawiki - index = mediawiki.mwGetIndex(config=config) - - return index - -def getJSON(request): - """Strip Unicode BOM""" - """if request.text.startswith(u'\ufeff'): - request.encoding = 'utf-8-sig' - return request.json()""" - return json.loads(request) - -def getNamespaces(config={}): - """ Returns list of namespaces for this wiki """ - - namespaces = [] - namespacenames = [] - if config['wikiengine'] == 'mediawiki': - import mediawiki - namespaces, namespacenames = mediawiki.mwGetNamespaces(config=config) - - return namespacenames - -def getPageTitles(config={}): - """ Returns page titles for this wiki """ - - if config['wikiengine'] == 'mediawiki': - import mediawiki - for pagetitle in mediawiki.mwGetPageTitles(config=config): - yield pagetitle - -def getParameters(params=[]): - """ Import parameters into variable """ - - if not params: - params = sys.argv - - config = {} - parser = argparse.ArgumentParser(description='Tools for downloading and preserving wikis.') - - # General params - parser.add_argument( - '-v', '--version', action='version', 
version=getVersion()) - parser.add_argument( - '--cookies', metavar="cookies.txt", help="Path to a cookies.txt file.") - parser.add_argument( - '--delay', - metavar=5, - default=0, - type=float, - help="Adds a delay (in seconds).") - parser.add_argument( - '--retries', - metavar=5, - default=5, - help="Maximum number of retries.") - parser.add_argument('--path', help='Path to store wiki dump at.') - parser.add_argument( - '--resume', - action='store_true', - help='Resumes previous incomplete dump (requires --path).') - parser.add_argument('--force', action='store_true', help='') - parser.add_argument( - '--user', help='Username if authentication is required.') - parser.add_argument( - '--pass', - dest='password', - help='Password if authentication is required.') - - # URL params - # This script should work with any general URL, finding out - # API, index.php or whatever by itself when necessary - groupWiki = parser.add_argument_group() - groupWiki.add_argument( - 'wiki', - default='', - nargs='?', - help="URL to wiki (e.g. http://wiki.domain.org).") - # URL params for MediaWiki - groupWiki.add_argument( - '--mwapi', - help="URL to MediaWiki API (e.g. http://wiki.domain.org/w/api.php).") - groupWiki.add_argument( - '--mwindex', - help="URL to MediaWiki index.php (e.g. 
http://wiki.domain.org/w/index.php).") - - # Download params - groupDownload = parser.add_argument_group( - 'Data to download', - 'What info download from the wiki') - groupDownload.add_argument( - '--pages', - action='store_true', - help="Generates a dump of pages (--pages --curonly for current revisions only).") - groupDownload.add_argument('--curonly', action='store_true', - help='Store only the current version of pages.') - groupDownload.add_argument( - '--images', action='store_true', help="Generates an image dump.") - groupDownload.add_argument( - '--namespaces', - metavar="1,2,3", - help='Comma-separated value of namespaces to include (all by default).') - groupDownload.add_argument( - '--exnamespaces', - metavar="1,2,3", - help='Comma-separated value of namespaces to exclude.') - - # Meta info params - groupMeta = parser.add_argument_group( - 'Meta info', - 'What meta info to retrieve from the wiki') - groupMeta.add_argument( - '--get-api', - action='store_true', - help="Returns wiki API when available.") - groupMeta.add_argument( - '--get-index', - action='store_true', - help="Returns wiki Index.php when available.") - groupMeta.add_argument( - '--get-page-titles', - action='store_true', - help="Returns wiki page titles.") - groupMeta.add_argument( - '--get-image-names', - action='store_true', - help="Returns wiki image names.") - groupMeta.add_argument( - '--get-namespaces', - action='store_true', - help="Returns wiki namespaces.") - groupMeta.add_argument( - '--get-wiki-engine', - action='store_true', - help="Returns wiki engine.") - - args = parser.parse_args() - #sys.stderr.write(args) - - # Not wiki? 
Exit - if not args.wiki: - sys.stderr.write('ERROR: Provide a URL to a wiki\n') - parser.print_help() - sys.exit(1) - - # Don't mix download params and meta info params - if (args.pages or args.images) and \ - (args.get_api or args.get_index or args.get_page_titles or args.get_image_names or args.get_namespaces or args.get_wiki_engine): - sys.stderr.write('ERROR: Don\'t mix download params and meta info params\n') - parser.print_help() - sys.exit(1) - - # No download params and no meta info params? Exit - if (not args.pages and not args.images) and \ - (not args.get_api and not args.get_index and not args.get_page_titles and not args.get_image_names and not args.get_namespaces and not args.get_wiki_engine): - sys.stderr.write('ERROR: Use at least one download param or meta info param\n') - parser.print_help() - sys.exit(1) - - # Load cookies - cj = cookielib.MozillaCookieJar() - if args.cookies: - cj.load(args.cookies) - sys.stderr.write('Using cookies from %s\n' % args.cookies) - - # check user and pass (one requires both) - if (args.user and not args.password) or (args.password and not args.user): - sys.stderr.write('ERROR: Both --user and --pass are required for authentication.\n') - parser.print_help() - sys.exit(1) - - session = None - if args.user and args.password: - import requests - session = requests.Session() - session.cookies = cj - session.headers.update({'User-Agent': getUserAgent()}) - session.auth = (args.user, args.password) - #session.mount(args.mw_api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret)) Mediawiki-centric, be careful - - # check URLs - for url in [args.mwapi, args.mwindex, args.wiki]: - if url and (not url.startswith('http://') and not url.startswith('https://')): - sys.stderr.write(url) - sys.stderr.write('ERROR: URLs must start with http:// or https://\n') - parser.print_help() - sys.exit(1) - - # Meta info params - metainfo = '' # only one allowed, so we don't mix output - if args.get_api: - metainfo = 'get_api' - elif 
args.get_index: - metainfo = 'get_index' - elif args.get_page_titles: - metainfo = 'get_page_titles' - elif args.get_image_names: - metainfo = 'get_image_names' - elif args.get_namespaces: - metainfo = 'get_namespaces' - elif args.get_wiki_engine: - metainfo = 'get_wiki_engine' - - namespaces = ['all'] - exnamespaces = [] - # Process namespace inclusions - if args.namespaces: - # fix, why - ? and... --namespaces= all with a space works? - if re.search( - r'[^\d, \-]', - args.namespaces) and args.namespaces.lower() != 'all': - sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas\n") - sys.exit() - else: - ns = re.sub(' ', '', args.namespaces) - if ns.lower() == 'all': - namespaces = ['all'] - else: - namespaces = [int(i) for i in ns.split(',')] - - # Process namespace exclusions - if args.exnamespaces: - if re.search(r'[^\d, \-]', args.exnamespaces): - sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas\n") - sys.exit(1) - else: - ns = re.sub(' ', '', args.exnamespaces) - if ns.lower() == 'all': - sys.stderr.write('You cannot exclude all namespaces.\n') - sys.exit(1) - else: - exnamespaces = [int(i) for i in ns.split(',')] - - # --curonly requires --xml - if args.curonly and not args.pages: - sys.stderr.write("--curonly requires --pages\n") - parser.print_help() - sys.exit(1) - - config = { - 'cookies': args.cookies or '', - 'curonly': args.curonly, - 'date': datetime.datetime.now().strftime('%Y%m%d'), - 'delay': args.delay, - 'exnamespaces': exnamespaces, - 'images': args.images, - 'logs': False, - 'metainfo': metainfo, - 'namespaces': namespaces, - 'pages': args.pages, - 'path': args.path and os.path.normpath(args.path) or '', - 'retries': int(args.retries), - 'wiki': args.wiki, - 'wikicanonical': '', - 'wikiengine': getWikiEngine(args.wiki), - 'other': { - 'configfilename': 'config.txt', - 'filenamelimit': 100, # do not change - 'force': args.force, - 'resume': args.resume, - 
'session': session, - } - } - - # Get ready special variables (API for MediWiki, etc) - if config['wikiengine'] == 'mediawiki': - import mediawiki - config['mwexport'] = 'Special:Export' - if not args.mwapi: - config['mwapi'] = mediawiki.mwGetAPI(config=config) - if not config['mwapi']: - sys.stderr.write('ERROR: Provide a URL to API\n') - sys.exit(1) - else: - data={ - 'action': 'query', - 'meta': 'siteinfo', - 'siprop': 'namespaces', - 'format': 'json'} - r = getURL(config['mwapi'], data=data) - config['mwexport'] = getJSON(r)['query']['namespaces']['-1']['*'] \ - + ':Export' - if not args.mwindex: - config['mwindex'] = mediawiki.mwGetIndex(config=config) - if not config['mwindex']: - sys.stderr.write('ERROR: Provide a URL to Index.php\n') - sys.exit(1) - elif wikiengine == 'wikispaces': - import wikispaces - # use wikicanonical for base url for Wikispaces? - - # calculating path, if not defined by user with --path= - if not config['path']: - config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date']) - - return config - -def getURL(url='', data=None): - # fix quizas pasandole el config pueda saber si esta definido el campo session y usarlo si interesa con un if - html = '' - try: - req = urllib.request.Request(url, headers={ 'User-Agent': 'Mozilla/5.0' }) - if data: - data = urllib.parse.urlencode(data).encode() - html = urllib.request.urlopen(req, data=data).read().decode().strip() - else: - html = urllib.request.urlopen(req).read().decode().strip() - except: - sys.stderr.write("Error while retrieving URL: %s\n" % url) - if data: - sys.stderr.write("Data sent: %s\n" % data) - sys.exit() - return html - -def getUserAgent(): - """ Return a cool user-agent to hide Python user-agent """ - - useragents = [ - 'Mozilla/5.0', - ] - return random.choice(useragents) - -def getVersion(): - return __version__ - -def getWikiEngine(url=''): - """ Returns wiki engine of a URL, if known """ - - wikiengine = 'unknown' - if url: - html = getURL(url=url) 
- else: - return wikiengine.lower() - - if re.search( - r'(?im)()', html): - wikiengine = 'moinmoin' - elif re.search(r'(?im)(twikiCurrentTopicLink|twikiCurrentWebHomeLink|twikiLink)', html): - wikiengine = 'twiki' - elif re.search(r'(?im)()', html): - wikiengine = 'pmwiki' - elif re.search(r'(?im)(|)', html): - wikiengine = 'wagn' - elif re.search(r'(?im)(\s*(

      )?JSPWiki|xmlns:jspwiki="http://www\.jspwiki\.org")', html): - wikiengine = 'jspwiki' - elif re.search(r'(?im)(Powered by:?\s*(
      )?\s*
      |\bKwikiNavigation\b)', html): - wikiengine = 'kwiki' - elif re.search(r'(?im)(Powered by )', html): - wikiengine = 'zwiki' - # WakkaWiki forks - elif re.search(r'(?im)()', html): - wikiengine = 'wikkawiki' # formerly WikkaWakkaWiki - elif re.search(r'(?im)(CitiWiki)', html): - wikiengine = 'citiwiki' - elif re.search(r'(?im)(Powered by |wikidot-privacy-button-hovertip|javascript:WIKIDOT\.page)', html): - wikiengine = 'wikidot' - elif re.search(r'(?im)(IS_WETPAINT_USER|wetpaintLoad|WPC-bodyContentContainer)', html): - wikiengine = 'wetpaint' - elif re.search(r'(?im)(

    • ]*?>\s*(?:)?\s*]*?>\s*(?:)?\s* len( + re.findall(r"/index\.php\?", result) + ): + index = "/".join(api.split("/")[:-1]) + "/index.php5" + else: + index = "/".join(api.split("/")[:-1]) + "/index.php" + + if not api and index: + api = urljoin(index, "api.php") + + # remove multiple slashes + # https://romancewiki.bham.ac.uk//api.php -> https://romancewiki.bham.ac.uk/api.php + # log: https://cdn.digitaldragon.dev/wikibot/jobs/a1847c8b-f01c-4533-8692-579f11da9c94/log.txt + # log: https://cdn.digitaldragon.dev/wikibot/jobs/4f18485c-e40c-4dcf-9a6d-6d20bf8a82f5/log.txt + if api: + api = re.sub(r"([^:])//+", r"\1/", api) + if index: + index = re.sub(r"([^:])//+", r"\1/", index) + + return api, index + + +def check_retry_API(api: str, apiclient=False, *, session: requests.Session): + """Call check_API and mwclient if necessary""" + check = None + try: + check = check_API(api, session=session) + except requests.exceptions.ConnectionError as e: + print("Connection error: %s" % (str(e))) + + if check and apiclient: + apiurl = urlparse(api) + try: + mwclient.Site( + apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session + ) + except KeyError: + # Probably KeyError: 'query' + if apiurl.scheme == "https": + newscheme = "http" + api = api.replace("https://", "http://") + else: + newscheme = "https" + api = api.replace("http://", "https://") + print( + "WARNING: The provided API URL did not work with mwclient. 
Switched protocol to: {}".format( + newscheme + ) + ) + + try: + mwclient.Site( + apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=newscheme, pool=session + ) + except KeyError: + check = False + + return check, api diff --git a/wikiteam3/dumpgenerator/api/get_json.py b/wikiteam3/dumpgenerator/api/get_json.py new file mode 100644 index 00000000..4cb9fc1d --- /dev/null +++ b/wikiteam3/dumpgenerator/api/get_json.py @@ -0,0 +1,14 @@ +import requests + + +def get_JSON(request: requests.Response): + """Strip Unicode BOM""" + if request.text.startswith("\ufeff"): + request.encoding = "utf-8-sig" + # request.encoding = request.apparent_encoding + try: + return request.json() + except Exception: + # Maybe an older API version which did not return correct JSON + print("Error: Could not parse JSON") + return {} diff --git a/wikiteam3/dumpgenerator/api/handle_status_code.py b/wikiteam3/dumpgenerator/api/handle_status_code.py new file mode 100644 index 00000000..edf7d329 --- /dev/null +++ b/wikiteam3/dumpgenerator/api/handle_status_code.py @@ -0,0 +1,37 @@ +import sys + +import requests + + +def handle_StatusCode(response: requests.Response): + status_code = response.status_code + if status_code >= 200 and status_code < 300: + return + + print("HTTP Error %d." % status_code) + if status_code >= 300 and status_code < 400: + print("Redirect should happen automatically: please report this as a bug.") + print(response.url) + + elif status_code == 400: + print("Bad Request: The wiki may be malfunctioning.") + print("Please try again later.") + print(response.url) + sys.exit(1) + + elif status_code == 401 or status_code == 403: + print("Authentication required.") + print("Please use --user and --pass.") + print(response.url) + sys.exit(1) + + elif status_code == 404: + print("Not found. 
Is Special:Export enabled for this wiki?") + print(response.url) + sys.exit(1) + + elif status_code == 429 or (status_code >= 500 and status_code < 600): + print("Server error, max retries exceeded.") + print("Please resume the dump later.") + print(response.url) + sys.exit(1) diff --git a/wikiteam3/dumpgenerator/api/index_check.py b/wikiteam3/dumpgenerator/api/index_check.py new file mode 100644 index 00000000..a7d477e0 --- /dev/null +++ b/wikiteam3/dumpgenerator/api/index_check.py @@ -0,0 +1,98 @@ +import re +import time + +import requests + +class print_probability: + """ Decorator for printing the return value of a function """ + + def __init__(self, func): + self.func = func + + def __call__(self, *args, **kwargs): + ret = self.func(*args, **kwargs) + print(f"index.php available probability: {ret*100:.0f}% ({ret})") + return ret + +@print_probability +def check_index(*, index: str, logged_in: bool, session: requests.Session) -> float: + """ Checking index.php availability + + returns: + the probability of index.php being available. 
+ * [0.0 - 0.5) - not available + * 0.5 - generally not sure + * (0.5 - 1] - available + """ + + print("Checking index.php...", index) + + r = None + + for page in [ + "Special:Random", + "Special:Version", + "Special:AllPages", + "Special:ListFiles", + "Special:Search", + ]: + print(f"check_index(): Trying {page}...") + + try: + try: + r = session.post(url=index, data={"title": page}, timeout=30, allow_redirects=True) + except requests.exceptions.TooManyRedirects: + r = session.post(url=index, params={"title": page}, timeout=30, allow_redirects=False) + except Exception as e: + print("check_index(): Exception:", e) + time.sleep(2) + continue + + for _r in r.history: + print(_r.request.method, _r.url, {"title": page}, _r.status_code) + print(r.request.method, r.url, {"title": page}, r.status_code) + + if r.status_code in [301, 302, 303, 307, 308]: + print("The index.php returned a redirect") + continue + + if r.status_code >= 400: + print(f"ERROR: The wiki returned status code HTTP {r.status_code}") + continue + + break + + if r is None: + print("ERROR: Failed to get index.php") + return 0.15 + + if r.status_code in [301, 302, 303, 307, 308]: + print("The index.php returned a redirect") + return 0.3 + + raw = r.text + # Workaround for + # [Don't try to download private wikis unless --cookies is given] + # (https://github.com/wikiTeam/wikiteam/issues/71) + if ( + re.search( + '(Special:Badtitle|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required)', + raw, + ) + and not logged_in + ): + print("ERROR: This wiki requires login and we are not authenticated") + return 0.5 + if re.search( + '(page-Index_php|"wgPageName":"Index.php"|"firstHeading">Index.php)', + raw, + ): + print("Looks like the page called Index.php, not index.php itself") + return 0.1 + if re.search( + '(This wiki is powered by|

      |meta name="generator" content="MediaWiki|class="mediawiki)', + raw, + ): + return 0.9 + + return 0.2 diff --git a/wikiteam3/dumpgenerator/api/namespaces.py b/wikiteam3/dumpgenerator/api/namespaces.py new file mode 100644 index 00000000..05fb3358 --- /dev/null +++ b/wikiteam3/dumpgenerator/api/namespaces.py @@ -0,0 +1,98 @@ +import re + +import requests + +from wikiteam3.dumpgenerator.cli import Delay +from wikiteam3.dumpgenerator.api import get_JSON +from wikiteam3.dumpgenerator.config import Config +from wikiteam3.utils.util import ALL_NAMESPACE_FLAG + +def getNamespacesScraper(config: Config, session: requests.Session): + """Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages""" + """Function called if no API is available""" + namespaces = config.namespaces + namespacenames = {0: ""} # main is 0, no prefix + if namespaces: + r = session.post( + url=config.index, params={"title": "Special:Allpages"}, timeout=30 + ) + raw = r.text + Delay(config=config) + + # [^>]*? 
to include selected="selected" + m = re.compile( + r'' + ).finditer(raw) + if ALL_NAMESPACE_FLAG in namespaces: + namespaces = [] + for i in m: + namespaces.append(int(i.group("namespaceid"))) + namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") + else: + # check if those namespaces really exist in this wiki + namespaces2 = [] + for i in m: + if int(i.group("namespaceid")) in namespaces: + namespaces2.append(int(i.group("namespaceid"))) + namespacenames[int(i.group("namespaceid"))] = i.group( + "namespacename" + ) + namespaces = namespaces2 + else: + namespaces = [0] + + namespaces = list(set(namespaces)) # uniques + print("%d namespaces found" % (len(namespaces))) + return namespaces, namespacenames + + +def getNamespacesAPI(config: Config, session: requests.Session): + """Uses the API to get the list of namespaces names and ids""" + namespaces = config.namespaces + namespace_names = {0: ""} # main is 0, no prefix + if namespaces: + r = session.get( + url=config.api, + params={ + "action": "query", + "meta": "siteinfo", + "siprop": "namespaces", + "format": "json", + }, + timeout=30, + ) + result = get_JSON(r) + Delay(config=config) + try: + nsquery = result["query"]["namespaces"] + except KeyError: + print("Error: could not get namespaces from the API request.") + print("HTTP %d" % r.status_code) + print(r.text) + raise + + if ALL_NAMESPACE_FLAG in namespaces: + namespaces = [] + for i in nsquery.keys(): + if int(i) < 0: # -1: Special, -2: Media, excluding + continue + namespaces.append(int(i)) + namespace_names[int(i)] = nsquery[i]["*"] + else: + # check if those namespaces really exist in this wiki + namespaces2 = [] + for i in nsquery.keys(): + bi = i + i = int(i) + if i < 0: # -1: Special, -2: Media, excluding + continue + if i in namespaces: + namespaces2.append(i) + namespace_names[i] = nsquery[bi]["*"] + namespaces = namespaces2 + else: + namespaces = [0] + + namespaces = list(set(namespaces)) # uniques + print("%d namespaces found" % 
def getPageTitlesAPI(config: Config, session: requests.Session):
    """Yield page titles, namespace by namespace, using the MediaWiki API.

    This is a generator: no request is made until the caller starts
    iterating, and any API error is raised during iteration rather than by
    the call itself.

    Args:
        config: dump configuration; ``config.api`` is the api.php URL and the
            namespace selection is resolved through ``getNamespacesAPI``.
        session: HTTP session shared with ``mwclient`` (passed as ``pool``).

    Yields:
        Each page title (``page.name``) as it is received.
    """
    titles = []
    namespaces, _namespacenames = getNamespacesAPI(config=config, session=session)

    # apply delay to the session for mwclient.Site.allpages()
    delay_session = SessionMonkeyPatch(
        session=session, config=config,
        add_delay=True, delay_msg="Session delay: "+__name__,
        hard_retries=3 # TODO: --hard-retries
    )
    delay_session.hijack()
    try:
        apiurl = urlparse(config.api)  # same for every namespace
        for namespace in namespaces:
            print(" Retrieving titles in the namespace %d" % (namespace))
            site = mwclient.Site(
                host=apiurl.netloc,
                path=apiurl.path.replace("api.php", ""),
                scheme=apiurl.scheme,
                pool=session
            )
            for page in site.allpages(namespace=namespace):
                assert isinstance(page, mwclient.page.Page)
                title = page.name
                titles.append(title)
                yield title

            if len(titles) != len(set(titles)):
                print("Probably a loop, switching to next namespace")
                titles = list(set(titles))
    finally:
        # Always undo the session patch: this also runs when the API raises
        # mid-listing or when the consumer stops iterating early.
        delay_session.release()
list of page titles from Special:Allpages""" + titles = [] + namespaces, namespacenames = getNamespacesScraper(config=config, session=session) + for namespace in namespaces: + print(" Retrieving titles in the namespace", namespace) + url = "{}?title=Special:Allpages&namespace={}".format( + config.index, namespace + ) + r = session.get(url=url, timeout=30) + raw = r.text + raw = clean_HTML(raw) + + r_title = r'title="(?P[^>]+)">' + r_suballpages = "" + r_suballpages1 = r'&from=(?P<from>[^>"]+)&to=(?P<to>[^>"]+)">' + r_suballpages2 = r'Special:Allpages/(?P<from>[^>"]+)">' + r_suballpages3 = r'&from=(?P<from>[^>"]+)" title="[^>]+">' + if re.search(r_suballpages1, raw): + r_suballpages = r_suballpages1 + elif re.search(r_suballpages2, raw): + r_suballpages = r_suballpages2 + elif re.search(r_suballpages3, raw): + r_suballpages = r_suballpages3 + else: + pass # perhaps no subpages + + # Should be enough subpages on Special:Allpages + deep = 50 + c = 0 + oldfr = "" + checked_suballpages = [] + rawacum = raw + while r_suballpages and re.search(r_suballpages, raw) and c < deep: + # load sub-Allpages + m = re.compile(r_suballpages).finditer(raw) + currfr = None + for i in m: + fr = i.group("from") + currfr = fr + + if oldfr == currfr: + # We are looping, exit the loop + pass + + if r_suballpages == r_suballpages1: + to = i.group("to") + name = f"{fr}-{to}" + url = "{}?title=Special:Allpages&namespace={}&from={}&to={}".format( + config.index, + namespace, + fr, + to, + ) # do not put urllib.parse.quote in fr or to + # fix, this regexp doesn't properly save everything? or does r_title fail on this + # type of subpage? 
(wikiindex) + elif r_suballpages == r_suballpages2: + # clean &namespace=\d, sometimes happens + fr = fr.split("&namespace=")[0] + name = fr + url = "{}?title=Special:Allpages/{}&namespace={}".format( + config.index, + name, + namespace, + ) + elif r_suballpages == r_suballpages3: + fr = fr.split("&namespace=")[0] + name = fr + url = "{}?title=Special:Allpages&from={}&namespace={}".format( + config.index, + name, + namespace, + ) + else: + assert False, "Unreachable" + + if name not in checked_suballpages: + # to avoid reload dupe subpages links + checked_suballpages.append(name) + Delay(config=config) + # print ('Fetching URL: ', url) + r = session.get(url=url, timeout=10) + raw = str(r.text) + raw = clean_HTML(raw) + rawacum += raw # merge it after removed junk + print( + " Reading", + name, + len(raw), + "bytes", + len(re.findall(r_suballpages, raw)), + "subpages", + len(re.findall(r_title, raw)), + "pages", + ) + + Delay(config=config) + + assert currfr is not None, "re.search found the pattern, but re.finditer fails, why?" 
def _save_titles(filepath: str, titles) -> int:
    """Write each title on its own line, then the ``--END--`` marker; return the title count."""
    count = 0
    # "wt" truncates, so a second attempt after a failed one starts clean.
    with open(filepath, "wt", encoding="utf-8") as titlesfile:
        for title in titles:
            titlesfile.write(str(title) + "\n")
            count += 1
        # TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
        # main namespace and widget namespace.
        # We can use sort -u in UNIX, but is it worth it?
        titlesfile.write("--END--\n")
    return count


def getPageTitles(config: Config, session: requests.Session):
    """Fetch the list of page titles and save it to the titles file.

    The API is used when ``config.api`` is set, falling back to scraping
    Special:Allpages if the API listing fails; with only ``config.index`` the
    scraper is used directly. The file ends with a ``--END--`` line so that a
    complete list can be told apart from a truncated one.

    Args:
        config: dump configuration (``api``, ``index``, ``namespaces``,
            ``path`` and ``date`` are read).
        session: HTTP session used for all requests.

    Returns:
        The name (not the full path) of the titles file.
    """
    # http://en.wikipedia.org/wiki/Special:AllPages
    # http://wiki.archiveteam.org/index.php?title=Special:AllPages
    # http://www.wikanda.es/wiki/Especial:Todas
    print(
        "Loading page titles from namespaces = %s"
        % (
            ",".join([str(i) for i in config.namespaces])
            if config.namespaces
            else "None"
        )
    )

    titlesfilename = "{}-{}-titles.txt".format(
        url2prefix_from_config(config=config), config.date
    )
    titlespath = "{}/{}".format(config.path, titlesfilename)

    if config.api:
        try:
            # getPageTitlesAPI is a generator, so API errors only surface
            # while it is being consumed; the consumption therefore has to
            # happen inside this try block for the fallback to be reachable.
            c = _save_titles(
                titlespath, getPageTitlesAPI(config=config, session=session)
            )
        except Exception:
            traceback.print_exc()
            print("Error: could not get page titles from the API")
            c = _save_titles(
                titlespath, getPageTitlesScraper(config=config, session=session)
            )
    elif config.index:
        c = _save_titles(
            titlespath, getPageTitlesScraper(config=config, session=session)
        )
    else:
        # Neither API nor index.php: the file holds only the end marker.
        c = _save_titles(titlespath, [])

    print("Titles saved at...", titlesfilename)

    print("%d page titles loaded" % (c))
    return titlesfilename
def get_WikiEngine(url: str, session: "requests.Session") -> str:
    """Identify the wiki software serving ``url`` from markers in its HTML.

    The page is requested with POST; when the reply is empty or has status
    403/405 it is requested again with GET. The body is then matched against
    an ordered list of engine signatures and the first hit wins.

    Returns:
        The engine name, or ``"Unknown"`` when no signature matches.
    """
    if not session:
        session = requests.Session()  # Create a new session
        session.headers.update({"User-Agent": get_random_UserAgent()})

    response = session.post(url=url, timeout=30)
    # adding 403 here break something elsewhere ?
    if response.text == "" or response.status_code in (405, 403):
        if response.status_code == 403:
            # https://github.com/saveweb/wikiteam3/issues/13
            # not sure if this is the best way to handle this
            print("403 Forbidden, trying GET instead of POST:", url)
        response = session.get(url=url, timeout=120)
    page = response.text

    # (pattern, engine) pairs, checked top to bottom; order matters.
    signatures = (
        ('(?im)(<meta name="generator" content="DokuWiki)|dokuwiki__site', "DokuWiki"),
        (
            '(?im)(alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki|class="mediawiki)',
            "MediaWiki",
        ),
        ('(?im)(>MoinMoin Powered</a>|<option value="LocalSiteMap">)', "MoinMoin"),
        ("(?im)(twikiCurrentTopicLink|twikiCurrentWebHomeLink|twikiLink)", "TWiki"),
        ("(?im)(<!--PageHeaderFmt-->)", "PmWiki"),
        (
            '(?im)(<meta name="generator" content="PhpWiki|<meta name="PHPWIKI_VERSION)',
            "PhpWiki",
        ),
        (
            r'(?im)(<meta name="generator" content="Tiki Wiki|Powered by <a href="http://(www\.)?tiki\.org"| id="tiki-(top|main)")',
            "TikiWiki",
        ),
        (
            r'(?im)(foswikiNoJs|<meta name="foswiki\.|foswikiTable|foswikiContentFooter)',
            "FosWiki",
        ),
        (r'(?im)(<meta http-equiv="powered by" content="MojoMojo)', "MojoMojo"),
        (
            r'(?im)(id="xwiki(content|nav_footer|platformversion|docinfo|maincontainer|data)|/resources/js/xwiki/xwiki|XWiki\.webapppath)',
            "XWiki",
        ),
        (r'(?im)(<meta id="confluence-(base-url|context-path)")', "Confluence"),
        (r'(?im)(<meta name="generator" content="Banana Dance)', "Banana Dance"),
        (
            r'(?im)(Wheeled by <a class="external-link" href="http://www\.wagn\.org">|<body id="wagn">)',
            "Wagn",
        ),
        # formerly DekiWiki
        (r'(?im)(<meta name="generator" content="MindTouch)', "MindTouch"),
        (
            r'(?im)(<div class="wikiversion">\s*(<p>)?JSPWiki|xmlns:jspwiki="http://www\.jspwiki\.org")',
            "JSPWiki",
        ),
        (
            r'(?im)(Powered by:?\s*(<br ?/>)?\s*<a href="http://kwiki\.org">|\bKwikiNavigation\b)',
            "Kwiki",
        ),
        (r'(?im)(Powered by <a href="http://www\.anwiki\.com")', "Anwiki"),
        (
            '(?im)(<meta name="generator" content="Aneuch|is powered by <em>Aneuch</em>|<!-- start of Aneuch markup -->)',
            "Aneuch",
        ),
        (r'(?im)(<meta name="generator" content="bitweaver)', "bitweaver"),
        (r'(?im)(powered by <a href="[^"]*\bzwiki.org(/[^"]*)?">)', "Zwiki"),
        # WakkaWiki forks
        (
            # formerly WikkaWakkaWiki
            r'(?im)(<meta name="generator" content="WikkaWiki|<a class="ext" href="(http://wikka\.jsnx\.com/|http://wikkawiki\.org/)">)',
            "WikkaWiki",
        ),
        (r'(?im)(<meta name="generator" content="CoMa Wiki)', "CoMaWiki"),
        (r'(?im)(Fonctionne avec <a href="http://www\.wikini\.net)', "WikiNi"),
        (r'(?im)(Powered by <a href="[^"]*CitiWiki">CitiWiki</a>)', "CitiWiki"),
        (
            r'(?im)(Powered by <a href="http://wackowiki\.com/|title="WackoWiki")',
            "WackoWiki",
        ),
        # This may not work for heavily modded/themed installations, e.g.
        # http://operawiki.info/
        (r'(?im)(Powered by <a href="http://www\.wakkawiki\.com)', "WakkaWiki"),
        # Custom wikis used by wiki farms
        (r'(?im)(var wikispaces_page|<div class="WikispacesContent)', "Wikispaces"),
        (
            r'(?im)(Powered by <a href="http://www\.wikidot\.com">|wikidot-privacy-button-hovertip|javascript:WIKIDOT\.page)',
            "Wikidot",
        ),
        (r"(?im)(IS_WETPAINT_USER|wetpaintLoad|WPC-bodyContentContainer)", "Wetpaint"),
        # formerly PBwiki
        ('(?im)(<div id="footer-pbwiki">|ws-nav-search|PBinfo *= *{)', "PBworks"),
    )

    wikiengine = "Unknown"
    for pattern, engine in signatures:
        if re.search(pattern, page):
            wikiengine = engine
            break
    # if wikiengine == 'Unknown': print (result)

    return wikiengine
re +import sys +import traceback +from typing import Tuple + +import requests +from requests.adapters import DEFAULT_RETRIES as REQUESTS_DEFAULT_RETRIES +from requests.adapters import HTTPAdapter +import urllib3 + +from wikiteam3.dumpgenerator.api import ( + check_retry_API, + get_WikiEngine, + mediawiki_get_API_and_Index, +) +from wikiteam3.dumpgenerator.api.index_check import check_index +from wikiteam3.dumpgenerator.cli.delay import Delay +from wikiteam3.dumpgenerator.config import Config, OtherConfig, new_config +from wikiteam3.dumpgenerator.version import getVersion +from wikiteam3.utils import ( + get_random_UserAgent, + mod_requests_text, + url2prefix_from_config, +) +from wikiteam3.utils.login import uniLogin +from wikiteam3.utils.monkey_patch import SessionMonkeyPatch, WakeTLSAdapter +from wikiteam3.utils.user_agent import setup_random_UserAgent +from wikiteam3.utils.util import ALL_NAMESPACE_FLAG + + +def getArgumentParser(): + parser = argparse.ArgumentParser(description="") + + # General params + parser.add_argument("-v", "--version", action="version", version=getVersion()) + parser.add_argument( + "--cookies", metavar="cookies.txt", help="path to a cookies.txt file" + ) + parser.add_argument( + "--delay", metavar="1.5", default=1.5, type=float, + help="adds a delay (in seconds) " + "[NOTE: most HTTP servers have a 5s HTTP/1.1 keep-alive timeout, you should consider it " + "if you wanna reuse the connection]" + ) + parser.add_argument( + "--retries", metavar="5", default=5, help="Maximum number of retries for each request before failing." + ) + parser.add_argument( + "--hard-retries", metavar="3", default=3, help="Maximum number of hard retries for each request before failing. 
(for now, this only controls the hard retries during images downloading)" + ) + parser.add_argument("--path", help="path to store wiki dump at") + parser.add_argument( + "--resume", + action="store_true", + help="resumes previous incomplete dump (requires --path)", + ) + parser.add_argument("--force", action="store_true", + help="download it even if Wikimedia site or a recent dump exists in the Internet Archive") + parser.add_argument("--user", help="Username if MediaWiki authentication is required.") + parser.add_argument( + "--pass", dest="password", help="Password if MediaWiki authentication is required." + ) + parser.add_argument( + "--http-user", dest="http_user", help="Username if HTTP authentication is required." + ) + parser.add_argument( + "--http-pass", dest="http_password", help="Password if HTTP authentication is required." + ) + parser.add_argument( + "--http-method", dest="http_method", default="POST", choices=["GET", "POST"], help="HTTP method to use when making API requests to the wiki (default: POST)" + ) + parser.add_argument( + '--insecure', action='store_true', help='Disable SSL certificate verification' + ) + parser.add_argument( + "--user-agent", + type=str, + default=f"wikiteam3/{getVersion()} (WikiTeam; ArchiveTeam) wikiteam3dumpgenerator (+github.com/saveweb/wikiteam3; +wiki.archiveteam.org/index.php/wikiTeam) We respect HTTP Retry-After header", + # help="User-Agent to use for requests (default: wikiteam3/<version> ...)", + help=argparse.SUPPRESS, # private option + ) + parser.add_argument( + "--verbose", action="store_true", help="" + ) + parser.add_argument( + "--api_chunksize", metavar="50", default=50, help="Chunk size for MediaWiki API (arvlimit, ailimit, etc.)" + ) + + # URL params + group_WikiOrAPIOrIndex = parser.add_argument_group() + group_WikiOrAPIOrIndex.add_argument( + "wiki", default="", nargs="?", help="URL to wiki (e.g. 
http://wiki.domain.org), auto detects API and index.php" + ) + group_WikiOrAPIOrIndex.add_argument( + "--api", help="URL to API (e.g. http://wiki.domain.org/w/api.php)" + ) + group_WikiOrAPIOrIndex.add_argument( + "--index", help="URL to index.php (e.g. http://wiki.domain.org/w/index.php), (not supported with --images on newer(?) MediaWiki without --api)" + ) + group_WikiOrAPIOrIndex.add_argument( + "--index-check-threshold", metavar="0.80", default=0.80, type=float, + help="pass index.php check if result is greater than (>) this value (default: 0.80)" + ) + + # Download params + group_download = parser.add_argument_group( + "Data to download", "What info download from the wiki" + ) + group_download.add_argument( + "--xml", + action="store_true", + help="Export XML dump using Special:Export (index.php). (supported with --curonly)", + ) + group_download.add_argument( + "--curonly", action="store_true", help="store only the latest revision of pages" + ) + group_download.add_argument( + "--xmlapiexport", + action="store_true", + help="Export XML dump using API:revisions instead of Special:Export, use this when Special:Export fails and xmlrevisions not supported. (supported with --curonly)", + ) + group_download.add_argument( + "--xmlrevisions", + action="store_true", + help="Export all revisions from an API generator (API:Allrevisions). MediaWiki 1.27+ only. (not supported with --curonly)", + ) + group_download.add_argument( + "--xmlrevisions_page", + action="store_true", + help="[[! Development only !]] Export all revisions from an API generator, but query page by page MediaWiki 1.27+ only. 
(default: --curonly)", + ) + group_download.add_argument( + "--redirects", action="store_true", help="Dump page redirects via API:Allredirects" + ) + group_download.add_argument( + "--namespaces", + metavar="1,2,3", + help="comma-separated value of namespaces to include (all by default)", + ) + group_download.add_argument( + "--images", action="store_true", help="Generates an image dump" + ) + group_image = parser.add_argument_group( + "Image dump options", "Options for image dump (--images)" + ) + group_image.add_argument( + "--add-referer-header", + metavar="auto|https://example.com", + type=str, + # help="Add Referer header to image requests. (auto: use wiki URL, URL: use the given URL)", + help=argparse.SUPPRESS, # private option + ) + group_image.add_argument( + "--bypass-cdn-image-compression", + action="store_true", + help="Bypass CDN image compression. (CloudFlare Polish, etc.) [WARNING: This will increase CDN origin traffic, and not effective for all HTTP Server/CDN, please don't use this blindly.]", + ) + group_image.add_argument( + "--image-timestamp-interval", + metavar="2019-01-02T01:36:06Z/2023-08-12T10:36:06Z", + help="Only download images uploaded in the given time interval. [format: ISO 8601 UTC interval] " + "(only works with api)", + ) + group_image.add_argument( + "--ia-wbm-booster", + type=int, + default=0, + choices=[0, 1, 2, 3], + required=False, + help="Download images from Internet Archive Wayback Machine if possible, reduce the bandwidth usage of the wiki. " + "[0: disabled (default), 1: use earliest snapshot, 2: use latest snapshot, " + "3: the closest snapshot to the image's upload time]", + ) + + # Assertions params + group_assert = parser.add_argument_group( + "Assertions", + "What assertions to check before actually downloading, if any assertion fails, program will exit with exit code 45. " + "[NOTE: This feature requires correct siteinfo API response from the wiki, and not working properly with some wikis. 
" + "But it's useful for mass automated archiving, so you can schedule a re-run for HUGE wiki that may run out of your disk]" + ) + group_assert.add_argument( + "--assert-max-pages", metavar="123", type=int, default=None, dest="assert_max_pages", + help="Maximum number of pages to download" + ) + group_assert.add_argument( + "--assert-max-edits", metavar="123", type=int, default=None, dest="assert_max_edits", + help="Maximum number of edits to download" + ) + group_assert.add_argument( + "--assert-max-images", metavar="123", type=int, default=None, dest="assert_max_images", + help="Maximum number of images to download" + ) + group_assert.add_argument( + "--assert-max-images-bytes", metavar="123", type=int, default=None, dest="assert_max_images_bytes", + help="Maximum number of bytes to download for images [NOTE: this assert happens after downloading images list]" + ) + + # Meta info params + group_meta = parser.add_argument_group( + "Meta info", "What meta info to retrieve from the wiki" + ) + group_meta.add_argument( + "--get-wiki-engine", action="store_true", help="returns the wiki engine" + ) + group_meta.add_argument( + "--failfast", + action="store_true", + help="[lack maintenance] Avoid resuming, discard failing wikis quickly. 
def checkParameters(args=None) -> bool:
    """Validate the combination of parsed command-line arguments.

    Every rule is checked (no early exit) so that all problems are reported
    in a single run; each violation prints one ``ERROR:`` line.

    Args:
        args: the parsed ``argparse.Namespace``. When omitted, a fresh empty
            namespace is used instead of one instance shared across calls.

    Returns:
        ``True`` when every rule passes, ``False`` otherwise.
    """
    if args is None:
        args = argparse.Namespace()

    passed = True

    # Don't mix download params and meta info params
    if (args.xml or args.images) and (args.get_wiki_engine):
        print("ERROR: Don't mix download params and meta info params")
        passed = False

    if [args.xmlrevisions, args.xmlapiexport, args.xmlrevisions_page].count(True) > 1:
        print("ERROR: --xmlrevisions, --xmlapiexport, --xmlrevisions_page are mutually exclusive")
        passed = False

    if not args.xml:
        if args.xmlrevisions or args.xmlapiexport or args.xmlrevisions_page:
            print("ERROR: --xmlrevisions, --xmlapiexport, --xmlrevisions_page require --xml")
            passed = False

    # No download params and no meta info params? Exit
    if not any([args.xml, args.images, args.redirects, args.get_wiki_engine]):
        print("ERROR: Use at least one download param or meta info param")
        passed = False

    # Check user and pass (one requires both)
    if (args.user and not args.password) or (args.password and not args.user):
        print("ERROR: Both --user and --pass are required for authentication.")
        passed = False

    # Check http-user and http-pass (one requires both)
    if (args.http_user and not args.http_password) or (args.http_password and not args.http_user):
        print("ERROR: Both --http-user and --http-pass are required for authentication.")
        passed = False

    # --curonly requires --xml
    if args.curonly and not args.xml:
        print("ERROR: --curonly requires --xml")
        passed = False

    # --xmlrevisions not supported with --curonly
    if args.xmlrevisions and args.curonly:
        print("ERROR: --xmlrevisions not supported with --curonly")
        passed = False

    # Check URLs
    for url in [args.api, args.index, args.wiki]:
        if url and not url.startswith(("http://", "https://")):
            print(url)
            print("ERROR: URLs must start with http:// or https://")
            passed = False

    return passed
_r.reason, _r.url) + print(f"Reqs: {r.request.method} {r.url}, {r.request.body}") + print(f"Resp: {r.request.method} {r.status_code} {r.reason} {r.url}") + if r.raw._connection.sock: + print(f"Conn: {r.raw._connection.sock.getsockname()} -> {r.raw._connection.sock.getpeername()[0]}") + + if args.verbose: + session.hooks['response'].append(print_request) + + # Custom session retry + __retries__ = REQUESTS_DEFAULT_RETRIES + try: + from urllib3.util.retry import Retry + + # Courtesy datashaman https://stackoverflow.com/a/35504626 + class CustomRetry(Retry): + def increment(self, method=None, url=None, *args, **kwargs): + if '_pool' in kwargs: + conn = kwargs['_pool'] # type: urllib3.connectionpool.HTTPSConnectionPool + if 'response' in kwargs: + try: + # drain conn in advance so that it won't be put back into conn.pool + kwargs['response'].drain_conn() + except Exception: + pass + # Useless, retry happens inside urllib3 + # for adapters in session.adapters.values(): + # adapters: HTTPAdapter + # adapters.poolmanager.clear() + + # Close existing connection so that a new connection will be used + if hasattr(conn, 'pool'): + pool = conn.pool # type: queue.Queue + try: + # Don't directly use this, This closes connection pool by making conn.pool = None + conn.close() + except Exception: + pass + conn.pool = pool + return super(CustomRetry, self).increment(method=method, url=url, *args, **kwargs) + + def sleep(self, response=None): + backoff = self.get_backoff_time() + if backoff <= 0: + return + if response is not None: + msg = 'req retry (%s)' % response.status + else: + msg = None + Delay(config=None, msg=msg, delay=backoff+5) + + __retries__ = CustomRetry( + total=int(args.retries), backoff_factor=1, + status_forcelist=[500, 502, 503, 504, 429], + allowed_methods=['DELETE', 'PUT', 'GET', 'OPTIONS', 'TRACE', 'HEAD', 'POST'] + ) + except Exception: + traceback.print_exc() + + # Mount adapters + for protocol in ['http://', 'https://']: + session.mount(protocol, + 
WakeTLSAdapter(max_retries=__retries__) if args.insecure + else HTTPAdapter(max_retries=__retries__) + ) + # Disable SSL verification + if args.insecure: + session.verify = False + requests.packages.urllib3.disable_warnings() # type: ignore + print("WARNING: SSL certificate verification disabled") + + # Set cookies + cj = http.cookiejar.MozillaCookieJar() + if args.cookies: + cj.load(args.cookies) + print("Using cookies from %s" % args.cookies) + session.cookies = cj + + # Setup user agent + if args.user_agent: + session.headers.update({"User-Agent": args.user_agent}) + if args.user_agent == "random": + session.headers.update({"User-Agent": get_random_UserAgent()}) + setup_random_UserAgent(session) # monkey patch + + # Set accept header + session.headers.update({"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"}) + + # Set HTTP Basic Auth + if args.http_user and args.http_password: + session.auth = (args.http_user, args.http_password) + + # Execute meta info params + if args.wiki: + if args.get_wiki_engine: + print(get_WikiEngine(url=args.wiki, session=session)) + sys.exit(0) + + # Get API and index and verify + api = args.api if args.api else "" + index = args.index if args.index else "" + if api == "" or index == "": + if args.wiki: + if get_WikiEngine(args.wiki, session=session) == "MediaWiki": + api2, index2 = mediawiki_get_API_and_Index(args.wiki, session=session) + if not api: + api = api2 + if not index: + index = index2 + else: + print("ERROR: Unsupported wiki. 
Wiki engines supported are: MediaWiki") + sys.exit(1) + else: + if api == "": + pass + elif index == "": + index = "/".join(api.split("/")[:-1]) + "/index.php" + print("Guessing index.php from API URL: ", index) + + # print (api) + # print (index) + index2 = None + + check, checkedapi = False, None + if api: + check, checkedapi = check_retry_API( + api=api, + apiclient=args.xmlrevisions, + session=session, + ) + + if api and check: + # Replace the index URL we got from the API check + index2 = check[1] + api = checkedapi + print("API is OK: ", checkedapi) + else: + if index and not args.wiki: + print("API not available. Trying with index.php only.") + args.api = None + else: + print("Error in API. Please, provide a correct path to API") + sys.exit(1) + + # login if needed + # TODO: Re-login after session expires + if args.user and args.password: + _session = uniLogin(api=api, index=index, session=session, username=args.user, password=args.password) + if _session: + session = _session + print("-- Login OK --") + else: + print("-- Login failed --") + + # check index + threshold: float = args.index_check_threshold + + if index and check_index(index=index, logged_in=bool(args.cookies), session=session) > threshold: + print("index.php is OK") + else: + index = index2 + if index and index.startswith("//"): + index = args.wiki.split("//")[0] + index + if index and check_index(index=index, logged_in=bool(args.cookies), session=session) > threshold: + print("index.php is OK") + else: + try: + index = "/".join(index.split("/")[:-1]) + except AttributeError: + index = None + if index and check_index(index=index, logged_in=bool(args.cookies), session=session) > threshold: + print("index.php is OK") + else: + print("Error in index.php.") + if not (args.xmlrevisions or args.xmlapiexport): + print( + "Please, provide a correct path to index.php or use --xmlrevisions or --xmlapiexport. Terminating." 
+ ) + sys.exit(11) + + + namespaces = [ALL_NAMESPACE_FLAG] + # Process namespace inclusions + if args.namespaces: + # fix, why - ? and... --namespaces= all with a space works? + if ( + re.search(r"[^\d, \-]", args.namespaces) + and args.namespaces.lower() != ALL_NAMESPACE_FLAG + ): + print( + "Invalid namespace values.\nValid format is integer(s) separated by commas" + ) + sys.exit(1) + else: + ns = re.sub(" ", "", args.namespaces) + if ns.lower() == ALL_NAMESPACE_FLAG: + namespaces = [ALL_NAMESPACE_FLAG] + else: + namespaces = [int(i) for i in ns.split(",")] + + config = Config( + curonly = args.curonly, + date = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d"), + api = api, + failfast = args.failfast, + http_method = args.http_method, + api_chunksize = int(args.api_chunksize), + index = index, + images = args.images, + redirects = args.redirects, + logs = False, + xml = args.xml, + xmlapiexport = args.xmlapiexport, + xmlrevisions = args.xmlrevisions or args.xmlrevisions_page, + xmlrevisions_page = args.xmlrevisions_page, + namespaces = namespaces, + path = args.path and os.path.normpath(args.path) or "", + delay = args.delay, + retries = int(args.retries), + ) + + + other = OtherConfig( + resume = args.resume, + force = args.force, + session = session, + bypass_cdn_image_compression = args.bypass_cdn_image_compression, + add_referer_header = args.add_referer_header, + image_timestamp_interval = args.image_timestamp_interval, + ia_wbm_booster = args.ia_wbm_booster, + + assert_max_pages = args.assert_max_pages, + assert_max_edits = args.assert_max_edits, + assert_max_images = args.assert_max_images, + assert_max_images_bytes = args.assert_max_images_bytes, + + hard_retries = int(args.hard_retries), + + upload = args.upload, + uploader_args = args.uploader_args, + ) + + # calculating path, if not defined by user with --path= + if not config.path: + config.path = "./{}-{}-wikidump".format( + url2prefix_from_config(config=config), + config.date, + ) + 
print("No --path argument provided. Defaulting to:") + print(" [working_directory]/[domain_prefix]-[date]-wikidump") + print("Which expands to:") + print(" " + config.path) + + if config.delay == 1.5: + print(f"--delay is the default value of {config.delay}") + print( + f"There will be a {config.delay} second delay between HTTP calls in order to keep the server from timing you out." + ) + print( + "If you know that this is unnecessary, you can manually specify '--delay 0.0'." + ) + + patch_sess.release() + return config, other diff --git a/wikiteam3/dumpgenerator/cli/delay.py b/wikiteam3/dumpgenerator/cli/delay.py new file mode 100644 index 00000000..83cadc4f --- /dev/null +++ b/wikiteam3/dumpgenerator/cli/delay.py @@ -0,0 +1,61 @@ +import itertools +import threading +import time +from typing import Optional + +from wikiteam3.dumpgenerator.config import Config, load_config +import timeit + +class Delay: + done: bool = False + lock: threading.Lock = threading.Lock() + + def animate(self): + progress_dots = itertools.cycle([".", "/", "-", "\\"]) + for dot in progress_dots: + with self.lock: + if self.done: + return + + print("\r" + self.ellipses, end=dot) + + time.sleep(4) + + def __init__(self, config: Optional[Config]=None, + msg: Optional[str]=None, delay: Optional[float]=None, dynamic: bool=True): + """Add a delay if configured for that + + if delay is None, use config.delay + if dynamic is True, load config.json every time delay is called + """ + + if delay is None: + assert isinstance(config, Config) + if dynamic: + assert config + t = timeit.default_timer() + try: + config_dynamic = load_config(config=config, config_filename="config.json") + except Exception as e: + print("Unable to load config.json for dynamic delay (fallback to static delay):", e) + config_dynamic = config + delay = config_dynamic.delay - (timeit.default_timer() - t) # compensation + else: + delay = config.delay + if delay <= 0: + return + + if msg: + self.ellipses = ("Delay %.1fs: %s " % 
(delay, msg)) + else: + self.ellipses = ("Delay %.1fs" % (delay)) + + ellipses_animation = threading.Thread(target=self.animate) + ellipses_animation.daemon = True + ellipses_animation.start() + + time.sleep(delay) + + with self.lock: + self.done = True + print("\r" + " " * len(self.ellipses), end=" \r") diff --git a/wikiteam3/dumpgenerator/cli/greeter.py b/wikiteam3/dumpgenerator/cli/greeter.py new file mode 100644 index 00000000..75d35231 --- /dev/null +++ b/wikiteam3/dumpgenerator/cli/greeter.py @@ -0,0 +1,35 @@ +import datetime + +from wikiteam3.dumpgenerator.version import getVersion + + +def welcome(): + """Opening message""" + + welcome_string = f"# Welcome to DumpGenerator {getVersion()} by WikiTeam3 (GPL v3)" + welcome_string += " " * (73 - len(welcome_string) - 1) + "#" + copyright_string = f"# Copyright (C) 2011-{datetime.datetime.now(datetime.timezone.utc).year} WikiTeam developers" + copyright_string += " " * (73 - len(copyright_string) - 1) + "#" + + return f"""\ +######################################################################### +{welcome_string} +# More info at: <https://github.com/saveweb/wikiteam3> # +{copyright_string} +######################################################################### +""" + + +def bye(wikidump_dir = None): + """Closing message""" + print( +f""" +---> Done <--- + +If this is a public wiki, please, consider publishing this dump to the Internet Archive: + +`wikiteam3uploader {wikidump_dir if wikidump_dir else ''}` + +Good luck! Bye! 
+""" +) \ No newline at end of file diff --git a/wikiteam3/dumpgenerator/config.py b/wikiteam3/dumpgenerator/config.py new file mode 100644 index 00000000..a6a4203a --- /dev/null +++ b/wikiteam3/dumpgenerator/config.py @@ -0,0 +1,123 @@ +import dataclasses +from dataclasses import field +import json +from typing import List, Optional + +import requests + + +def _dataclass_from_dict(klass_or_obj, d: dict): + if isinstance(klass_or_obj, type): # klass + ret = klass_or_obj() + else: + ret = klass_or_obj + for k,v in d.items(): + if hasattr(ret, k): + setattr(ret, k, v) + return ret + + +@dataclasses.dataclass +class Config: + def asdict(self): + return dataclasses.asdict(self) + + # General params + delay: float = 0.0 + """ Delay between requests """ + retries: int = 0 + """ Number of retries """ + path: str = '' + """ Path to save the wikidump """ + logs: bool = False + """ + Save MediaWiki logs #NOTE: this feature is not implemented yet + https://www.mediawiki.org/wiki/Manual:Logging_table + """ + date: str = False + """ + Date of the dump + `datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d")` + """ + + # URL params + index: str = '' + api: str = '' + + # Download params + xml: bool = False + curonly: bool = False + xmlapiexport: bool = False + xmlrevisions: bool = False + xmlrevisions_page: bool = False + images: bool = False + redirects: bool = False + namespaces: List[int] = field(default_factory=list) + """ [ALL_NAMESPACE_FLAG] or [int,...] """ + exnamespaces: List[int] = field(default_factory=list) + """ [REMOVED FEATURE] keep for config backward compatibility. 
check wikiteam3#35 """ + + api_chunksize: int = 0 # arvlimit, ailimit, etc + export: str = '' + """ `Special:Export` page name """ + http_method: str = '' + """ GET/POST """ + + # Meta info params + failfast: bool = False + + templates: bool = False # TODO: rename to `xml_export_include_templates` + """ + Whether to include `&templates=1` parameter in the `Special:Export` (--xml) export action. + https://www.mediawiki.org/wiki/Manual:Parameters_to_Special:Export#Available_parameters + + NOTE: this config is not used to control the export of templates namespace (--namespaces). + """ + +def new_config(configDict) -> Config: + return _dataclass_from_dict(Config, configDict) + +def load_config(config: Config, config_filename: str): + """Load config file""" + + config_dict = dataclasses.asdict(config) + + if config.path: + try: + with open(f"{config.path}/{config_filename}", encoding="utf-8") as infile: + config_dict.update(json.load(infile)) + return new_config(config_dict) + except FileNotFoundError: + raise + + raise FileNotFoundError(f"Config file {config_filename} not found") + +def save_config(config: Config, config_filename: str): + """Save config file""" + + with open(f"{config.path}/{config_filename}", "w", encoding="utf-8") as outfile: + json.dump(dataclasses.asdict(config), outfile, indent=4, sort_keys=True) + + +@dataclasses.dataclass +class OtherConfig: + resume: bool + force: bool + session: requests.Session + bypass_cdn_image_compression: bool + add_referer_header: Optional[str] + '''None, "auto", {URL}''' + image_timestamp_interval: Optional[str] + ''' 2019-01-02T01:36:06Z/2023-08-12T10:36:06Z ''' + ia_wbm_booster: int + + assert_max_pages: Optional[int] + assert_max_edits: Optional[int] + assert_max_images: Optional[int] + assert_max_images_bytes: Optional[int] + + hard_retries: int + """ Number of hard retries """ + + upload: bool + uploader_args: List[str] \ No newline at end of file diff --git a/wikiteam3/dumpgenerator/dump/__init__.py 
b/wikiteam3/dumpgenerator/dump/__init__.py new file mode 100644 index 00000000..c3e470c7 --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/__init__.py @@ -0,0 +1 @@ +from .generator import DumpGenerator diff --git a/wikiteam3/dumpgenerator/dump/generator.py b/wikiteam3/dumpgenerator/dump/generator.py new file mode 100644 index 00000000..9b2ac3fa --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/generator.py @@ -0,0 +1,275 @@ +import os +import re +import subprocess +import sys + +from file_read_backwards import FileReadBackwards + +from wikiteam3.dumpgenerator.config import OtherConfig, load_config, save_config +from wikiteam3.dumpgenerator.config import Config +from wikiteam3.dumpgenerator.cli import get_parameters, bye, welcome +from wikiteam3.dumpgenerator.dump.image.image import FILENAME_LIMIT, Image +from wikiteam3.dumpgenerator.dump.misc.index_php import save_IndexPHP +from wikiteam3.dumpgenerator.dump.misc.special_logs import save_SpecialLog +from wikiteam3.dumpgenerator.dump.misc.special_version import save_SpecialVersion +from wikiteam3.dumpgenerator.dump.misc.site_info import assert_siteinfo, get_siteinfo, save_siteinfo +from wikiteam3.dumpgenerator.dump.redirect.redirects_dump import generate_redirects_dump +from wikiteam3.dumpgenerator.dump.xmldump.xml_dump import generate_XML_dump +from wikiteam3.dumpgenerator.dump.xmldump.xml_integrity import check_XML_integrity +from wikiteam3.dumpgenerator.log import log_error +from wikiteam3.utils import url2prefix_from_config, undo_HTML_entities, avoid_WikiMedia_projects +from wikiteam3.utils.ia_checker import any_recent_ia_item_exists +from wikiteam3.utils.util import ALL_DUMPED_MARK, int_or_zero, mark_as_done, underscore +from wikiteam3.utils.wiki_avoid import avoid_robots_disallow + + +class DumpGenerator: + configfilename = "config.json" + + @staticmethod + def __init__(params=None): + """Main function""" + config_filename = DumpGenerator.configfilename + config, other = get_parameters(params=params) + 
avoid_WikiMedia_projects(config=config, other=other) + avoid_robots_disallow(config=config, other=other) + + print(welcome()) + print("Analysing %s" % (config.api if config.api else config.index)) + + # do not enter if resume is requested from begining + while not other.resume and os.path.isdir(config.path): + print('\nWarning!: "%s" path exists' % (config.path)) + reply = "y" if config.failfast else "" + while reply.lower()[:1] not in ["y", "n"]: + reply = input( + 'There is a dump in "%s", probably incomplete.\n' + 'If you choose resume, to avoid conflicts, some parameters ' + 'you have chosen in the current session will be ignored\n' + 'and the parameters available in "%s/%s" will be loaded.\n' + 'Do you want to resume (y/n)? ' + % (config.path, config.path, config_filename) + ) + reply = reply.lower()[:1] + if reply == "y": + if not os.path.isfile("{}/{}".format(config.path, config_filename)): + print("No config file found. I can't resume. Aborting.") + sys.exit(1) + print("You have selected: YES") + other.resume = True + break + elif reply == "n": + print("You have selected: NO.\nbye.") + # other.resume = False + sys.exit(0) + + if asserts_enabled := [(arg, v) for arg, v in other.__dict__.items() if arg.startswith("assert_") and v is not None]: + site_info = get_siteinfo(config=config, session=other.session) + assert_siteinfo(site_info, other) + [print(f"--{arg}: {v}, passed") for arg, v in asserts_enabled] + + if other.resume: + print("Loading config file to resume...") + config = load_config(config=config, config_filename=config_filename) + else: + if not other.force and any_recent_ia_item_exists(config, days=365): + print("A dump of this wiki was uploaded to IA in the last 365 days.") + print("If you want to generate a new dump, use --force") + sys.exit(88) + + os.mkdir(config.path) + save_config(config=config, config_filename=config_filename) + + if other.resume: + DumpGenerator.resumePreviousDump(config=config, other=other) + else: + 
DumpGenerator.createNewDump(config=config, other=other) + + if config.index: + save_IndexPHP(config=config, session=other.session) + save_SpecialVersion(config=config, session=other.session) + if config.api: + save_siteinfo(config=config, session=other.session) + + mark_as_done(config=config, mark=ALL_DUMPED_MARK) + bye(config.path) + if other.upload: + print('Calling uploader... (--upload)') + retcode = subprocess.call([sys.executable, '-m', 'wikiteam3.uploader', config.path] + other.uploader_args, + shell=False) + if retcode: + print(f'--upload: Failed: {retcode}') + sys.exit(retcode) + + print('--upload: Done') + + @staticmethod + def createNewDump(config: Config, other: OtherConfig): + # we do lazy title dumping here :) + images = [] + print("Trying generating a new dump into a new directory...") + if config.xml: + generate_XML_dump(config=config, session=other.session) + check_XML_integrity(config=config, session=other.session) + if config.redirects: + generate_redirects_dump(config=config, session=other.session) + if config.images: + images += Image.get_image_names(config=config, session=other.session) + Image.save_image_names(config=config, other=other, images=images) + Image.generate_image_dump( + config=config, other=other, images=images, session=other.session + ) + if config.logs: + pass # TODO + # save_SpecialLog(config=config, session=other.session) + + @staticmethod + def resumePreviousDump(config: Config, other: OtherConfig): + images = [] + print("Resuming previous dump process...") + if config.xml: + + # checking xml dump + xml_is_complete = False + last_xml_title = None + last_xml_revid = None + try: + with FileReadBackwards( + "%s/%s-%s-%s.xml" + % ( + config.path, + url2prefix_from_config(config=config), + config.date, + "current" if config.curonly else "history", + ), + encoding="utf-8", + ) as frb: + for l in frb: + if l.strip() == "</mediawiki>": + # xml dump is complete + xml_is_complete = True + break + + xmlrevid = re.search(r" 
<id>([^<]+)</id>", l) + if xmlrevid: + last_xml_revid = int(xmlrevid.group(1)) + xmltitle = re.search(r"<title>([^<]+)", l) + if xmltitle: + last_xml_title = undo_HTML_entities(text=xmltitle.group(1)) + break + + except Exception: + pass # probably file does not exists + + if xml_is_complete: + print("XML dump was completed in the previous session") + elif last_xml_title: + # resuming... + print('Resuming XML dump from "%s" (revision id %s)' % (last_xml_title, last_xml_revid)) + generate_XML_dump( + config=config, + session=other.session, + resume=True, + ) + else: + # corrupt? only has XML header? + print("XML is corrupt? Regenerating...") + generate_XML_dump(config=config, session=other.session) + + + if config.redirects: + generate_redirects_dump(config=config, resume=True, session=other.session) + + + if config.images: + # load images list + last_line = "" + imagesFilePath = "%s/%s-%s-images.txt" % (config.path, url2prefix_from_config(config=config), config.date) + if os.path.exists(imagesFilePath): + with open(imagesFilePath, "r", encoding="utf-8") as f: + while line := f.readline().rstrip(): + last_line = line + if "\t" in line: + images.append(line.split("\t")) + + if len(images)>0 and len(images[0]) < 5: + print( + "Warning: Detected old images list (images.txt) format.\n"+ + "You can delete 'images.txt' manually and restart the script." + ) + sys.exit(9) + if last_line == "--END--": + print("Image list was completed in the previous session") + else: + print("Image list is incomplete. 
Reloading...") + # do not resume, reload, to avoid inconsistences, deleted images or + # so + images = Image.get_image_names(config=config, session=other.session) + Image.save_image_names(config=config, other=other, images=images) + # checking images directory + files = set() + du_dir: int = 0 # du -s {config.path}/images + if os.path.exists(f"{config.path}/images"): + c_loaded = 0 + for file in os.scandir(f"{config.path}/images"): + if not file.is_file(): + print(f"Warning: {file.name} is not a file") + continue + + du_dir += file.stat().st_size + + if underscore(file.name) != file.name: # " " in filename + os.rename(f"{config.path}/images/{file.name}", + f"{config.path}/images/{underscore(file.name)}") + print(f"Renamed {file.name} to {underscore(file.name)}") + + files.add(underscore(file.name)) + + c_loaded += 1 + if c_loaded % 12000 == 0: + print(f"[progress] {c_loaded} files loaded...", end="\r") + print(f"{c_loaded} files in $wikidump/images/ dir, du -s: {du_dir} bytes ({du_dir/1024/1024/1024:.2f} GiB)") + + c_images_size = 0 + c_images_downloaded = 0 + c_images_downloaded_size = 0 + c_checked = 0 + + for filename, url, uploader, size, sha1, timestamp in images: + filename = underscore(filename) + if FILENAME_LIMIT < len(filename.encode('utf-8')): + log_error( + config=config, to_stdout=True, + text=f"Filename too long(>240 bytes), skipping: {filename}", + ) + continue + if filename in files: + c_images_downloaded += 1 + c_images_downloaded_size += int_or_zero(size) + c_checked += 1 + c_images_size += int_or_zero(size) + if c_checked % 100000 == 0: + print(f"checked {c_checked}/{len(images)} records", end="\r") + print(f"{len(images)} records in images.txt, {c_images_downloaded} files were saved in the previous session") + print(f"Estimated size of all images (images.txt): {c_images_size} bytes ({c_images_size/1024/1024/1024:.2f} GiB)") + if c_images_downloaded < len(images): + complete = False + print("WARNING: Some images were not saved in the previous 
session") + else: + complete = True + if complete: + # image dump is complete + print("Image dump was completed in the previous session") + else: + # we resume from previous image, which may be corrupted + # by the previous session ctrl-c or abort + Image.generate_image_dump( + config=config, + other=other, + images=images, + session=other.session, + ) + + if config.logs: + # fix + pass diff --git a/wikiteam3/dumpgenerator/dump/image/__init__.py b/wikiteam3/dumpgenerator/dump/image/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wikiteam3/dumpgenerator/dump/image/html_regexs.py b/wikiteam3/dumpgenerator/dump/image/html_regexs.py new file mode 100644 index 00000000..6871eda3 --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/image/html_regexs.py @@ -0,0 +1,43 @@ +R_NEXT = r"(?<=&dir=prev)&offset=(?P\d+)" + +REGEX_CANDIDATES = [ + # [0] + # archiveteam 1.15.1 Yahoovideo.jpg (file) + # wikanda 1.15.5 Fernandocg + r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)' + + # [1] + # wikijuegos 1.9.5 + # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old + # mediawiki version + ,r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+\s*[^<]+\s*[^<]+\s*]+>(?P[^<]+)' + + # [2] + # gentoowiki 1.18 + ,r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>]+>[^<]+]+>(?P[^<]+)' + + # [3] + # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= + # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
      + ,r'(?ism)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)' + + # [4] + ,( + r'(?im)\s*]+>(?P[^>]+)[^<]*?[^<]*?[^<]*?\s*' + r'[^\n\r]*?\s*' + r'[^<]*?\s*' + r'\s*()?(?P[^<]+?)()?\s*' + ) + + # [5] mediawiki 1.43.0 + # mediawiki.org-20240924 + ,( + r'(?im)\s*]+>(?P[^>]+)[^<]*?[^<]*?[^<]*?\s*' + r'[^\n\r]*?\s*' + r'[^<]*?\s*' + r'.*?]*?>(?P[^<]+?).+?' + ) +] diff --git a/wikiteam3/dumpgenerator/dump/image/image.py b/wikiteam3/dumpgenerator/dump/image/image.py new file mode 100644 index 00000000..9324677a --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/image/image.py @@ -0,0 +1,736 @@ +import datetime +import os +import re +import shutil +import sys +import time +import urllib.parse +import warnings +from pathlib import Path +from typing import Dict, List, Optional, Union + +import requests + +from wikiteam3.dumpgenerator.api import get_JSON, handle_StatusCode +from wikiteam3.dumpgenerator.cli import Delay +from wikiteam3.dumpgenerator.config import Config, OtherConfig +from wikiteam3.dumpgenerator.dump.image.html_regexs import R_NEXT, REGEX_CANDIDATES +from wikiteam3.dumpgenerator.exceptions import FileSha1Error, FileSizeError +from wikiteam3.dumpgenerator.log import log_error +from wikiteam3.dumpgenerator.version import getVersion +from wikiteam3.utils.identifier import url2prefix_from_config +from wikiteam3.utils.monkey_patch import SessionMonkeyPatch +from wikiteam3.utils.util import clean_HTML, int_or_zero, sha1bytes, sha1sum, space, underscore, undo_HTML_entities + +NULL = "null" +""" NULL value for image metadata """ +FILENAME_LIMIT = 240 +""" Filename not be longer than 240 **bytes**. 
(MediaWiki r98430 2011-09-29) """ +STDOUT_IS_TTY = sys.stdout and sys.stdout.isatty() + + +WBM_EARLIEST = 1 +WBN_LATEST = 2 +WBM_BEST = 3 + + +def check_response(r: requests.Response) -> None: + if r.headers.get("cf-polished", ""): + raise RuntimeError("Found cf-polished header in response, use --bypass-cdn-image-compression to bypass it") + +class Image: + + @staticmethod + def generate_image_dump(config: Config, other: OtherConfig, images: List[List], + session: requests.Session): + """ Save files and descriptions using a file list """ + + image_timestamp_intervals = None + if other.image_timestamp_interval: + image_timestamp_intervals = other.image_timestamp_interval.split("/") + assert len(image_timestamp_intervals) == 2 + image_timestamp_intervals = [ + datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ") + for x in image_timestamp_intervals] + + print("Retrieving images...") + images_dir = Path(config.path) / "images" + images_mismatch_dir = Path(config.path) / "images_mismatch" + [os.makedirs(dir_, exist_ok=True) or print(f'Creating "{dir_}" directory') + for dir_ in (images_dir, images_mismatch_dir) if not dir_.exists()] + + c_savedImageFiles = 0 + c_savedMismatchImageFiles = 0 + c_wbm_speedup_files = 0 + + + def delete_mismatch_image(filename_underscore: str) -> bool: + """ + Delete mismatch image (in `images_mismatch` directory) + + return True if file is deleted, False if file not exists + """ + assert filename_underscore == underscore(filename_underscore) + + if os.path.exists(images_mismatch_dir / filename_underscore): + os.remove(images_mismatch_dir / filename_underscore) + return True + return False + + + def modify_params(params: Optional[Dict] = None) -> Dict: + """ bypass Cloudflare Polish (image optimization) """ + if params is None: + params = {} + if other.bypass_cdn_image_compression is True: + # bypass Cloudflare Polish (image optimization) + # + params["_wiki_t"] = int(time.time()*1000) + params["_wikiteam3_nocdn"] = "init_req" # this value 
will be changed on hard retry + + return params + + def modify_headers(headers: Optional[Dict] = None) -> Dict: + """ add HTTP Referer header """ + if headers is None: + headers = {} + if other.add_referer_header: + url = config.index if config.index else config.api + parsed_url = urllib.parse.urlparse( + other.add_referer_header + if other.add_referer_header != "auto" + else url + ) + + headers["Referer"] = f"{parsed_url.scheme}://{parsed_url.netloc}/" + + return headers + + + patch_sess = SessionMonkeyPatch(session=session, config=config, hard_retries=other.hard_retries) + patch_sess.hijack() + + ia_session = requests.Session() + ia_session.headers.update({"User-Agent": f"wikiteam3/{getVersion()}"}) + + skip_to_filename = underscore('') # TODO: use this + + while images: + filename_raw, url_raw, uploader_raw, size, sha1, timestamp = images.pop(0) # reduce memory usage by poping + filename_underscore = underscore(filename_raw) + # uploader_underscore = space(uploader_raw) + + # https://github.com/saveweb/wikiteam3/issues/52 + # --- Fandom PNG skip logic when resuming --- + is_fandom = ( + "fandom.com" in (config.api or config.index) and + "static.wikia.nocookie.net" in url_raw + ) + is_pic = filename_underscore.lower().endswith(('.png', '.jpg', '.jpeg')) + mismatch_file = (Path(config.path) / "images_mismatch" / filename_underscore) + if is_fandom and is_pic and mismatch_file.is_file(): + print(f"Skipping Fandom PNG/JPG (already in images_mismatch): {filename_underscore}") + continue + # --- end skip logic --- + + if skip_to_filename and skip_to_filename != filename_underscore: + print(f" {filename_underscore}", end="\r") + continue + else: + skip_to_filename = '' + + to_download = True + + if image_timestamp_intervals: + if timestamp == NULL: + print(f" {filename_underscore}|timestamp is unknown: {NULL}, downloading anyway...") + else: + if not ( + image_timestamp_intervals[0] + <= datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ") + <= 
image_timestamp_intervals[1] + ): + print(f" timestamp {timestamp} is not in interval {other.image_timestamp_interval}: {filename_underscore}") + continue + else: + print(f" timestamp {timestamp} is in interval {other.image_timestamp_interval}: {filename_underscore}") + + # saving file + if filename_underscore != urllib.parse.unquote(filename_underscore): + print(f"WARNING: {filename_underscore}|filename may not be unquoted: {filename_underscore}") + if len(filename_underscore.encode('utf-8')) > FILENAME_LIMIT: + log_error( + config=config, to_stdout=True, + text=f"Filename is too long(>{FILENAME_LIMIT} bytes), skipping: '{filename_underscore}'", + ) + # TODO: hash as filename instead of skipping + continue + + filepath_space = images_dir / space(filename_underscore) + filepath_underscore = images_dir / filename_underscore + + if filepath_space.is_file(): + # rename file to underscore + shutil.move(filepath_space, filepath_underscore) + + # check if file already exists in 'images' dir and has the same size and sha1 + if ((size != NULL + and filepath_underscore.is_file() + and os.path.getsize(filepath_underscore) == int(size) + and sha1sum(filepath_underscore) == sha1) + or (sha1 == NULL and filepath_underscore.is_file())): + # sha1 is NULL if file not in original wiki (probably deleted, + # you will get a 404 error if you try to download it) + c_savedImageFiles += 1 + to_download = False + print_msg=f" {c_savedImageFiles}|sha1 matched: {filename_underscore}" + print(print_msg[0:70], end="\r") + if sha1 == NULL: + log_error(config=config, to_stdout=True, + text=f"sha1 is {NULL} for {filename_underscore}, file may not in wiki site (probably deleted). 
" + ) + else: + # Delay(config=config, delay=config.delay + random.uniform(0, 1)) + url = url_raw + + r: Optional[requests.Response] = None + if other.ia_wbm_booster: + def get_ia_wbm_response() -> Optional[requests.Response]: + """ Get response from Internet Archive Wayback Machine + return None if not found / failed """ + if other.ia_wbm_booster in (WBM_EARLIEST, WBN_LATEST): + ia_timestamp = other.ia_wbm_booster + elif other.ia_wbm_booster == WBM_BEST: + if timestamp != NULL: + ia_timestamp = [x for x in timestamp if x.isdigit()][0:8] + ia_timestamp = "".join(ia_timestamp) + else: + print(f"ia_wbm_booster: timestamp is {NULL}, use latest timestamp") + ia_timestamp = 2 + else: + raise ValueError(f"ia_wbm_booster is {other.ia_wbm_booster}, but it should be 0, 1, 2 or 3") + + available_api = "http://archive.org/wayback/available" + # TODO: cdx_api = "http://web.archive.org/cdx/search/cdx" + snap_url = f"https://web.archive.org/web/{ia_timestamp}id_/{url}" + + try: + _r = ia_session.get(available_api, params={"url": url}, headers={"User-Agent": "wikiteam3"}, + timeout=10) + if _r.status_code == 429: + raise Warning("IA API rate limit exceeded (HTTP 429)") + _r.raise_for_status() + api_result = _r.json() + if api_result["archived_snapshots"]: + r = ia_session.get(url=snap_url, allow_redirects=True) + # r.raise_for_status() + else: + r = None + except Exception as e: + print("ia_wbm_booster:",e) + r = None + + return r + r = get_ia_wbm_response() + + # verify response + if r and r.status_code != 200: + r = None + elif r and size != NULL and len(r.content) != int(size): # and r.status_code == 200: + # FileSizeError + # print(f"WARNING: {filename_unquoted} size should be {size}, but got {len(r.content)} from WBM, use original url...") + r = None + + if r is not None: + c_wbm_speedup_files += 1 + + + if r is None: + Delay(config=config) + try: + r = session.get(url=url, params=modify_params(), headers=modify_headers(), allow_redirects=True) + except 
requests.exceptions.ContentDecodingError as e: + # Workaround for https://fedoraproject.org/w/uploads/5/54/Duffy-f12-banner.svgz + # (see also https://cdn.digitaldragon.dev/wikibot/jobs/b0f52fc3-927b-4d14-aded-89a2795e8d4d/log.txt) + # server response with "Content-Encoding: gzip" (or other) but the transfer is not encoded/compressed actually + # If this workround can't get the original file, the file will be thrown to images_mismatch dir, not too bad :) + log_error( + config, to_stdout=True, + text=f"{e} when downloading {filename_underscore} with URL {url} . " + "Retrying with 'Accept-Encoding: identity' header and no transfer auto-decompresion..." + ) + _headers = modify_headers() + _headers["Accept-Encoding"] = "identity" + r = session.get(url=url, params=modify_params(), headers=_headers, allow_redirects=True, stream=True) + r._content = r.raw.read() + + check_response(r) + + # a trick to get original file (fandom) + ori_url = url + if "fandom.com" in config.api \ + and "static.wikia.nocookie.net" in url \ + and "?" 
in url \ + and ( + sha1 != NULL and sha1bytes(r.content) != sha1 + or size != NULL and len(r.content) != int(size) + ): + ori_url = url + "&format=original" + Delay(config=config) + r = session.get(url=ori_url, params=modify_params(), headers=modify_headers(), allow_redirects=True) + check_response(r) + + # Try to fix a broken HTTP to HTTPS redirect + original_url_redirected: bool = r.url in (url, ori_url) + if r.status_code == 404 and original_url_redirected: + print(f"WARNING: {url} broken (404), trying to fix it...") + if ( + url_raw.startswith("http://") + and url.startswith("https://") + ): + url = "https://" + url_raw.split("://")[1] + # print 'Maybe a broken http to https redirect, trying ', url + r = session.get(url=url, params=modify_params(), headers=modify_headers(), allow_redirects=True) + check_response(r) + + if r.status_code == 200: + try: + if (sha1 == NULL and size == NULL) \ + or ( + (sha1 == NULL or sha1bytes(r.content) == sha1) + and (size == NULL or len(r.content) == int(size) ) + ): + try: + with open(filepath_underscore, "wb") as imagefile: + imagefile.write(r.content) + except KeyboardInterrupt: + if filepath_underscore.is_file(): + os.remove(filepath_underscore) + raise + delete_mismatch_image(filename_underscore) # delete previous mismatch image + c_savedImageFiles += 1 + else: + if size != NULL and len(r.content) != int(size): + raise FileSizeError(file=filename_underscore, + got_size=len(r.content), + excpected_size=int(size), + online_url=url) + elif sha1bytes(r.content) != sha1: + raise FileSha1Error(file=filename_underscore, excpected_sha1=sha1) + else: + raise RuntimeError("Unknown error") + except OSError: + log_error( + config=config, to_stdout=True, + text=f"File '{filepath_underscore}' could not be created by OS", + ) + continue + except (FileSha1Error, FileSizeError) as e: + log_error( + config=config, to_stdout=True, + text=f"{e}. 
@staticmethod
def get_image_names(config: Config, session: requests.Session):
    """Return the sorted list of image records for the wiki.

    The API is preferred when ``config.api`` is set; otherwise the
    Special:Imagelist scraper is used when ``config.index`` is set. When
    neither is configured an empty list is returned.

    Args:
        config: dump configuration; ``api`` and ``index`` select the backend.
        session: HTTP session handed through to the selected backend.

    Returns:
        The list produced by the backend, sorted in place.
    """
    # The original message carried a stray ")" in front of the text.
    print("Retrieving image filenames")

    images = []
    if config.api:
        print("Using API to retrieve image names...")
        images = Image.get_image_names_API(config=config, session=session)
    elif config.index:
        print("Using index.php (Special:Imagelist) to retrieve image names...")
        images = Image.get_image_names_scraper(config=config, session=session)

    print(f"Sorting image filenames ({len(images)} images)...")
    images.sort()
    print("Done")

    return images
filename, url, uploader""" + + images = [] + limit = 5000 + retries = config.retries + offset = None + while offset or len(images) == 0: + # 5000 overload some servers, but it is needed for sites like this with + # no next links + # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= + params = {"title": "Special:Imagelist", "limit": limit, "dir": "prev", "offset": offset} + r = session.post( + url=config.index, + params=params, + timeout=30, + ) + raw = r.text + Delay(config=config) + # delicate wiki + if re.search( + r"(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)", + raw, + ): + if limit > 10: + print(f"Error: listing {limit} images in a chunk is not possible, trying tiny chunks") + limit = limit // 10 + continue + elif retries > 0: # waste retries, then exit + retries -= 1 + print("Retrying...") + continue + else: + raise RuntimeError("retries exhausted") + + raw = clean_HTML(raw) + + # Select the regexp that returns more results + best_matched = 0 + regexp_best = None + for regexp in REGEX_CANDIDATES: + _count = len(re.findall(regexp, raw)) + if _count > best_matched: + best_matched = _count + regexp_best = regexp + assert regexp_best is not None, "Could not find a proper regexp to parse the HTML" + m = re.compile(regexp_best).finditer(raw) + + # Iter the image results + for i in m: + url = i.group("url") + url = Image.curate_image_URL(config=config, url=url) + + filename = i.group("filename") + filename = undo_HTML_entities(text=filename) + filename = urllib.parse.unquote(filename) + + uploader = i.group("uploader") + uploader = undo_HTML_entities(text=uploader) + uploader = urllib.parse.unquote(uploader) + + # timestamp = i.group("timestamp") + # print(" %s" % (timestamp)) + + size = NULL # size not accurate + sha1 = NULL # sha1 not available + timestamp = NULL # date formats are difficult to parse + images.append([ + underscore(filename), url, space(uploader), + size, sha1, 
@staticmethod
def get_image_names_API(config: Config, session: requests.Session):
    """Retrieve the file list through the MediaWiki API.

    API:Allimages is tried first. As soon as a response carries no ``query``
    member, everything gathered so far is discarded and the listing restarts
    with API:Allpages (namespace 6) used as a generator for ``prop=imageinfo``.

    Args:
        config: dump configuration; ``api`` and ``api_chunksize`` are read.
        session: HTTP session used for every API request.

    Returns:
        A list of ``[filename, url, uploader, size, sha1, timestamp]`` records
        (filename underscored, uploader spaced). ``size``, ``sha1`` and
        ``timestamp`` are ``NULL`` when the API does not report them.
    """

    def query_api(params):
        """Send one API query, pause per config, and return the decoded JSON."""
        # FIXME Handle HTTP Errors HERE
        r = session.get(url=config.api, params=params, timeout=30)
        handle_StatusCode(r)
        decoded = get_JSON(r)
        Delay(config=config)
        return decoded

    # # Commented by @yzqzss:
    # https://www.mediawiki.org/wiki/API:Allpages
    # API:Allpages requires MW >= 1.8
    # API:Allimages requires MW >= 1.13
    use_oldAPI = False
    images = []
    countImages = 0

    aifrom = "!"
    while aifrom:
        print(f'Using API:Allimages to get the list of images, {len(images)} images found so far...', end='\r')
        jsonimages = query_api({
            "action": "query",
            "list": "allimages",
            "aiprop": "url|user|size|sha1|timestamp",
            "aifrom": aifrom,
            "format": "json",
            "ailimit": config.api_chunksize,
        })

        if "query" not in jsonimages:
            use_oldAPI = True
            break

        allimages = jsonimages["query"]["allimages"]
        countImages += len(allimages)

        # oldAPI = True
        # break
        # # uncomment to force use API:Allpages generator
        # # may also can as a fallback if API:Allimages response is wrong

        # The continuation lives either under "query-continue"/"allimages"
        # or directly under "continue"; an empty value ends the loop.
        if "query-continue" in jsonimages and "allimages" in jsonimages["query-continue"]:
            continuation = jsonimages["query-continue"]["allimages"]
        elif "continue" in jsonimages:
            continuation = jsonimages["continue"]
        else:
            continuation = {}
        aifrom = ""
        if "aicontinue" in continuation:
            aifrom = continuation["aicontinue"]
        elif "aifrom" in continuation:
            aifrom = continuation["aifrom"]
        print(countImages, aifrom[0:30].ljust(60), end="\r")

        for image in allimages:
            url = Image.curate_image_URL(config=config, url=image["url"])

            filename = image.get("name", None)
            if filename is None:
                # No name reported: derive it from the URL path instead.
                if ".wikia." in config.api or ".fandom.com" in config.api:
                    filename = urllib.parse.unquote(url.split("/")[-3])
                else:
                    filename = urllib.parse.unquote(url.split("/")[-1])

            if "%u" in filename:
                warnings.warn(
                    f"Filename {filename} may contains unquoted URL characters, please review it manually. FILENAME: {filename} URL:{url}",
                    UnicodeWarning,
                )

            uploader = image.get("user", "Unknown")
            # size or sha1 is not always available (e.g. https://wiki.mozilla.org/index.php?curid=20675)
            size = image.get("size", NULL)
            sha1 = image.get("sha1", NULL)
            timestamp = image.get("timestamp", NULL)
            images.append([underscore(filename), url, space(uploader), size, sha1, timestamp])

    if use_oldAPI:
        print(" API:Allimages not available. Using API:Allpages generator instead.")
        images = []
        countImages = 0  # restart the progress counter together with the list
        gapfrom = "!"
        while gapfrom:
            # Some old APIs doesn't have allimages query
            # In this case use allpages (in nm=6) as generator for imageinfo
            # Example:
            # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
            # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            jsonimages = query_api({
                "action": "query",
                "generator": "allpages",
                "gapnamespace": 6,
                "gaplimit": config.api_chunksize,  # The value must be between 1 and 500.
                # TODO: Is it OK to set it higher, for speed?
                "gapfrom": gapfrom,
                "prop": "imageinfo",
                "iiprop": "url|user|size|sha1|timestamp",
                "format": "json",
            })

            if "query" not in jsonimages:
                # if the API doesn't return query data, then we're done
                break

            pages = jsonimages["query"]["pages"]
            countImages += len(pages)
            print(countImages, gapfrom[0:30].ljust(60), end="\r")

            gapfrom = ""
            # all modern (as of 20221231) wikis return 'continue' instead of 'query-continue'
            if "continue" in jsonimages and "gapcontinue" in jsonimages["continue"]:
                gapfrom = jsonimages["continue"]["gapcontinue"]
            # prior to mw1.21, that raw continuation (query-continue) was the only option.
            elif "query-continue" in jsonimages and "allpages" in jsonimages["query-continue"]:
                gapfrom = jsonimages["query-continue"]["allpages"].get("gapfrom", "")

            for props in pages.values():
                imageinfo = props.get("imageinfo")
                if not imageinfo:
                    # A page in the File namespace with no imageinfo entry has
                    # nothing to download; skip it instead of raising KeyError.
                    continue
                info = imageinfo[0]

                url = Image.curate_image_URL(config=config, url=info["url"])
                # drop the namespace prefix ("File:...") from the title
                filename = props["title"].partition(":")[2]
                # same default as the Allimages branch when the user is missing
                uploader = info.get("user", "Unknown")
                size = info.get("size", NULL)
                sha1 = info.get("sha1", NULL)
                timestamp = info.get("timestamp", NULL)
                images.append([underscore(filename), url, space(uploader), size, sha1, timestamp])

    if len(images) == 1:
        print(" Found 1 image")
    else:
        print(" Found %d images" % (len(images)))

    return images
@staticmethod
def curate_image_URL(config: Config, url: str):
    """Return an absolute URL for an image, adding the scheme/domain if missing.

    The wiki origin is taken from ``config.index`` and, failing that, from
    ``config.api``. Protocol-relative URLs (``//host/...``) get the wiki's
    scheme; any URL not starting with ``http://`` or ``https://`` is treated
    as relative to the wiki origin. HTML entities are then undone and the
    result is passed through ``underscore``.

    Args:
        config: dump configuration providing ``index`` and/or ``api``.
        url: image URL as reported by the wiki (absolute, protocol-relative
            or relative). An empty string is treated as a relative URL.

    Returns:
        The absolute, underscored URL.

    Raises:
        SystemExit: with code 1 when neither ``index`` nor ``api`` is set.
    """
    base = config.index or config.api
    if not base:
        print("ERROR: no index nor API")
        sys.exit(1)

    # keep "scheme://host": everything up to the first "/" after the domain
    parts = base.split("://")
    scheme = parts[0]
    domainalone = scheme + "://" + parts[1].split("/")[0]

    if url.startswith("//"):  # Orain wikifarm returns URLs starting with //
        url = "{}:{}".format(scheme, url)
    elif not url.startswith(("http://", "https://")):
        # Relative URL. startswith() also copes with an empty string, which
        # the former url[0] check could not.
        if url.startswith("/"):  # slash is added later
            url = url[1:]
        # concat http(s) + domain + relative url
        url = f"{domainalone}/{url}"

    url = undo_HTML_entities(text=url)
    # url = urllib.parse.unquote(url) #do not use unquote with url, it break some
    # urls with odd chars

    return underscore(url)
def assert_siteinfo(result, other: OtherConfig):
    """Abort the run when the wiki exceeds a configured assert_max_* limit.

    Checks, in order, ``pages``, ``images`` and ``edits`` from the site
    statistics against ``other.assert_max_pages``, ``other.assert_max_images``
    and ``other.assert_max_edits``. A limit of ``None`` disables that check,
    and its statistic is then not read at all.

    The checks are explicit comparisons rather than ``assert`` statements so
    that they stay active when Python runs with ``-O``.

    Args:
        result: siteinfo response, either the full API reply (statistics under
            ``result["query"]["statistics"]``) or a mapping that has
            ``"statistics"`` at top level.
        other: object carrying the three ``assert_max_*`` limits.

    Raises:
        SystemExit: with code 45 when a statistic is above its limit.
    """
    stats = result["query"]["statistics"] if "query" in result else result["statistics"]

    limits = (
        ("pages", other.assert_max_pages),
        ("images", other.assert_max_images),
        ("edits", other.assert_max_edits),
    )
    for key, limit in limits:
        if limit is None:
            continue
        if stats[key] > limit:
            print(
                f"assert_max_{key}: {stats[key]} {key} reported by siteinfo, "
                f"more than the allowed maximum of {limit}",
                file=sys.stderr,
            )
            sys.exit(45)
def save_SpecialVersion(config: Config, session: requests.Session):
    """Store the wiki's Special:Version page as SpecialVersion.html.

    The page lists installed extensions, which is why it is preserved. An
    existing SpecialVersion.html under ``config.path`` is never overwritten,
    and a failed request is reported on stdout without writing anything.
    """
    assert config.index

    destination = "%s/SpecialVersion.html" % (config.path)
    if os.path.exists(destination):
        print("SpecialVersion.html exists, do not overwrite")
        return

    print("Downloading Special:Version with extensions and other related info")
    try:
        response = session.post(
            url=config.index, params={"title": "Special:Version"}, timeout=10
        )
    except Exception as err:
        print("Error: %s" % (err))
        return

    html = response.text
    Delay(config=config)
    # pass the page through remove_IP before it is written to disk
    cleaned = remove_IP(raw=html)
    with open(destination, "w", encoding="utf-8") as outfile:
        outfile.write(cleaned)
def reconstructRevisions(root: ET.Element):
    """Rebuild export-style <revision> elements from an API revisions reply.

    Args:
        root: parsed API response; the revisions are looked up under
            ``query/pages/page/revisions/rev``.

    Returns:
        A ``(page, edits)`` tuple: ``page`` is a ``stub`` element holding one
        ``revision`` child per ``rev``, and ``edits`` is the number of
        revisions converted. When the page has no ``revisions`` element an
        empty stub and ``0`` are returned.

    Raises:
        Exception: any error raised while converting a revision is re-raised
            after the offending ``rev`` and the traceback have been printed.
    """
    page = ET.Element('stub')
    edits = 0

    revisions = root.find('query').find('pages').find('page').find('revisions')  # type: ignore
    if revisions is None:
        # case: https://wiki.archlinux.org/index.php?title=Arabic&action=history
        # No matching revisions were found.
        print('!!! No revisions found in page !!!')
        return page, edits  # no revisions

    for rev in revisions.findall('rev'):
        attrs = rev.attrib
        try:
            rev_ = ET.SubElement(page, 'revision')
            # id
            ET.SubElement(rev_, 'id').text = attrs['revid']
            # parentid (optional, export-0.7+, positiveInteger)
            if 'parentid' in attrs and int(attrs['parentid']) > 0:
                ET.SubElement(rev_, 'parentid').text = attrs['parentid']
            # timestamp
            ET.SubElement(rev_, 'timestamp').text = attrs['timestamp']
            # contributor
            contributor = ET.SubElement(rev_, 'contributor')
            if 'userhidden' not in attrs:
                ET.SubElement(contributor, 'username').text = attrs['user']
                # The reply may lack userid; treat it as optional, like sha1
                # and contentmodel, instead of failing the whole page.
                if 'userid' in attrs:
                    ET.SubElement(contributor, 'id').text = attrs['userid']
            else:
                contributor.set('deleted', 'deleted')
            # comment (optional)
            if 'commenthidden' in attrs:
                print('commenthidden')
                ET.SubElement(rev_, 'comment').set('deleted', 'deleted')
            elif attrs.get('comment'):  # '' is empty: no comment element
                ET.SubElement(rev_, 'comment').text = attrs['comment']

            # minor edit (optional)
            if 'minor' in attrs:
                ET.SubElement(rev_, 'minor')
            # model and format (optional, export-0.8+)
            if 'contentmodel' in attrs:
                ET.SubElement(rev_, 'model').text = attrs['contentmodel']  # default: 'wikitext'
            if 'contentformat' in attrs:
                ET.SubElement(rev_, 'format').text = attrs['contentformat']  # default: 'text/x-wiki'
            # text
            text = ET.SubElement(rev_, 'text')
            if 'texthidden' not in attrs:
                text.attrib['xml:space'] = "preserve"
                if 'size' in attrs:  # size is optional for the same reason as userid
                    text.attrib['bytes'] = attrs['size']
                text.text = rev.text
            else:
                # NOTE: this is not the same as the text being empty
                text.set('deleted', 'deleted')
            # sha1
            if 'sha1' in attrs:
                ET.SubElement(rev_, 'sha1').text = attrs['sha1']
            elif 'sha1hidden' in attrs:
                ET.SubElement(rev_, 'sha1')  # stub
            # else: the sha1 may not have been backfilled on older wikis or
            # lack for other reasons (Wikia); emit nothing.

            edits += 1
        except Exception:
            print(ET.tostring(rev))
            traceback.print_exc()
            raise
    return page, edits
+ # config.curonly means that the whole dump is configured to save only the last, + # params['curonly'] should mean that we've already tried this + # fallback, because it's set by the following if and passed to + # getXMLPageCore + # TODO: save only the last version when failed + print(' Saving in the errors log, and skipping...') + log_error( + config=config, + text='Error while retrieving the last revision of "%s". Skipping.' % + (params['titles' if config.xmlapiexport else 'pages'])) + raise ExportAbortedError(config.index) + return '' # empty xml + + # FIXME HANDLE HTTP Errors HERE + try: + r = session.get(url=config.api, params=params, headers=headers) + handle_StatusCode(r) + xml = r.text + # print xml + except requests.exceptions.ConnectionError as e: + print(' Connection error: %s' % (str(e.args[0]))) + xml = '' + except requests.exceptions.ReadTimeout as e: + print(" Read timeout: %s" % (str(e.args[0]))) + xml = "" + c += 1 + return xml + + +def getXMLPageWithApi(config: Config, title="", verbose=True, *, session: requests.Session): + """ Get the full history (or current only) of a page using API:Query + if params['curonly'] is set, then using export&exportwrap to export + """ + + title_ = underscore(title) + # do not convert & into %26, title_ = re.sub('&', '%26', title_) + # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE + # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize + # print 'current:%s' % (title_) + if not config.curonly: + params = {'titles': title_, 'action': 'query', 'format': 'xml', + 'prop': 'revisions', + 'rvprop': # rvprop: + '|'.join([ + 'timestamp', 'user', 'comment', 'content', # MW v???? 
+ 'ids', 'flags', 'size', # MW v1.11 + 'userid', # MW v1.17 + 'sha1', # MW v1.19 + 'contentmodel' # MW v1.21 + ]), + 'rvcontinue': None, + 'rvlimit': config.api_chunksize + } + else: + params = {'titles': title_, 'action': 'query', 'format': 'xml', 'export': 1, 'exportnowrap': 1} + # print 'params:%s' % (params) + if not config.curonly: + firstpartok = False + lastcontinue = None + numberofedits = 0 + ret = '' + continueKey: Optional[str] = None + + retries_left = config.retries + while True: + if retries_left <= 0: + raise RuntimeError("Retries exceeded") + # in case the last request is not right, saving last time's progress + if not firstpartok: + lastcontinue = params.get(continueKey, None) if continueKey is not None else None + + xml = getXMLPageCoreWithApi(params=params, config=config, session=session) + if xml == "": + # just return so that we can continue, and getXMLPageCoreWithApi will log the error + return + try: + root = ET.fromstring(xml.encode('utf-8')) + except Exception as e: + retries_left -= 1 + traceback.print_exc() + print("Retrying...") + continue + try: + retpage = root.find('query').find('pages').find('page') # type: ignore + except Exception: + retries_left -= 1 + traceback.print_exc() + print("Retrying...") + continue + + assert retpage is not None, "Should have a page" + + if 'missing' in retpage.attrib or 'invalid' in retpage.attrib: + print('Page not found') + raise PageMissingError(params['titles'], xml) + if not firstpartok: + try: + # build the firstpart by ourselves to improve the memory usage + ret = ' \n' + ret += ' %s\n' % (retpage.attrib['title']) + ret += ' %s\n' % (retpage.attrib['ns']) + ret += ' %s\n' % (retpage.attrib['pageid']) + except Exception: + firstpartok = False + retries_left -= 1 + traceback.print_exc() + print("Retrying...") + continue + else: + firstpartok = True + yield ret + + # find the continue key + continueVal = None + if root.find('continue') is not None: + # uses continue.rvcontinue + # MW 1.26+ + 
continueKey = 'rvcontinue' + continueVal = root.find('continue').attrib['rvcontinue'] # type: ignore + elif root.find('query-continue') is not None: + revContinue = root.find('query-continue').find('revisions') # type: ignore + assert revContinue is not None, "Should only have revisions continue" + if 'rvcontinue' in revContinue.attrib: + # MW 1.21 ~ 1.25 + continueKey = 'rvcontinue' + continueVal = revContinue.attrib['rvcontinue'] + elif 'rvstartid' in revContinue.attrib: + # TODO: MW ???? + continueKey = 'rvstartid' + continueVal = revContinue.attrib['rvstartid'] + else: + # blindly assume the first attribute is the continue key + # may never happen + assert len(revContinue.attrib) > 0, "Should have at least one attribute" + for continueKey in revContinue.attrib.keys(): + continueVal = revContinue.attrib[continueKey] + break + if continueVal is not None: + params[continueKey] = continueVal + + # build the revision tags + try: + ret = '' + edits = 0 + + # transform the revision + rev_, edits = reconstructRevisions(root=root) + numberofedits += edits + xmldom = MD.parseString(b'' + ET.tostring(rev_) + b'') + # convert it into text in case it throws MemoryError + # delete the first three line and last two line,which is for setting the indent + ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2]) + yield ret + if config.curonly or continueVal is None: # no continue + break + except Exception: + retries_left -= 1 + traceback.print_exc() + print("Retrying...") + params['rvcontinue'] = lastcontinue + ret = '' + yield ' \n' + if numberofedits == 0: + raise PageMissingError(title=title_, xml=xml) + else: # curonly + xml = getXMLPageCoreWithApi(params=params, config=config, session=session) + if xml == "": + raise ExportAbortedError(config.index) + if "" not in xml: + raise PageMissingError(title_, xml) + + yield xml.split("")[0] + + # just for looking good :) + r_timestamp = r'([^<]+)' + + numberofedits = 0 + numberofedits += len(re.findall(r_timestamp, 
xml)) + + yield "\n" + + if verbose: + if (numberofedits == 1): + print(' %s, 1 edit' % (title.strip())) + else: + print(' %s, %d edits' % (title.strip(), numberofedits)) diff --git a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py new file mode 100644 index 00000000..93d9ef54 --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py @@ -0,0 +1,213 @@ +import os +import re +import sys +import time +from typing import Any, Dict, Generator + +import requests + +from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingError +from wikiteam3.dumpgenerator.api import handle_StatusCode +from wikiteam3.dumpgenerator.log import log_error +from wikiteam3.dumpgenerator.config import Config +from wikiteam3.utils.util import underscore + + +HISTORY_MIN_CHUNKSIZE = 2 +""" To loop over all the revisions, we need to retrieve at least 2 revisions at a time. """ +MAX_SECONDS = 100 +""" max seconds to wait in a single sleeping. """ + +def getXMLPageCore(params: Dict, config: Config, session: requests.Session) -> str: + """ + returns a XML containing params['limit'] revisions (or current only), ending in + if retrieving params['limit'] revisions fails, returns a current only version + if all fail, returns the empty string + """ + assert "pages" in params, "pages not in params" + assert "limit" in params, "limit not in params" + + xml = "" + c = 0 + maxretries = config.retries # x retries and skip + increment_delay = max(config.delay, 1.0) + + while not re.search(r"", xml): + if c > 0 and (c < maxretries or params["limit"] > HISTORY_MIN_CHUNKSIZE): + delay = min(increment_delay * c, MAX_SECONDS) # incremental until MAX_SECONDS + print( + f' In attempt {c}, XML for "{params["pages"]}" is wrong. Waiting {delay} seconds and reloading...' 
+ ) + time.sleep(delay) + # reducing server load requesting smallest chunks (if curonly then + # limit = 1 from mother function) + if params["limit"] > 1: + # NOTE: if limit is float and betwennt 0 to 1, the MW backend will force-int it to 0 + new_limit: int = params["limit"] // 2 # half + if new_limit < HISTORY_MIN_CHUNKSIZE: + new_limit = HISTORY_MIN_CHUNKSIZE + + assert new_limit >= HISTORY_MIN_CHUNKSIZE, f"new_limit: {new_limit} < {HISTORY_MIN_CHUNKSIZE}" + + # set new limit + if new_limit != params["limit"]: + print( + f' Reducing the chunksize of revisions to retrieve from {params["limit"]} to {new_limit}' + ) + params["limit"] = new_limit + if c >= maxretries: + print(" We have retried %d times" % (c)) + print( + ' MediaWiki error for "%s", network error or whatever...' + % (params["pages"]) + ) + if config.failfast: + print("Exit, it will be for another time") + sys.exit(1) + # If it's not already what we tried: our last chance, preserve only the last revision... + # config.curonly means that the whole dump is configured to save only the last, + # params['curonly'] should mean that we've already tried this + # fallback, because it's set by the following if and passed to + # getXMLPageCore + if not config.curonly and "curonly" not in params: + print(" Trying to save only the last revision for this page...") + params["curonly"] = 1 + log_error( + config=config, to_stdout=True, + text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' + % (params["pages"]), + ) + return getXMLPageCore( + params=params, config=config, session=session + ) + else: + print(" Saving in the errors log, and skipping...") + log_error( + config=config, to_stdout=True, + text='Error while retrieving the last revision of "%s". Skipping.' 
+ % (params["pages"]), + ) + raise ExportAbortedError(config.index) + return "" # empty xml + # FIXME HANDLE HTTP Errors HERE + try: + r = session.post( + url=config.index, params=params, timeout=120 + ) + handle_StatusCode(r) + xml = r.text + except requests.exceptions.ConnectionError as e: + print(" Connection error: %s" % (str(e.args[0]))) + xml = "" + except requests.exceptions.ReadTimeout as e: + print(" Read timeout: %s" % (str(e.args[0]))) + xml = "" + c += 1 + + return xml + + +def getXMLPageWithExport(config: Config, title: str, + *, verbose=True, session: requests.Session + ) -> Generator[str, None, None]: + """Get the full history (or current only) of a page""" + + # if server errors occurs while retrieving the full page history, + # it may return [oldest OK versions] + last version, excluding middle revisions, + # so it would be partialy truncated + # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F + + PARAM_LIMIT = int(os.getenv("PARAM_XML_LIMIT", 1000)) + truncated = False + title_ = underscore(title) + # do not convert & into %26, title_ = re.sub('&', '%26', title_) + + params: Dict[str, Any] + if config.export: + params = {"title": config.export, "pages": title_, "action": "submit"} + else: + params = {"title": "Special:Export", "pages": title_, "action": "submit"} + if config.curonly: + params["curonly"] = 1 + params["limit"] = 1 + else: + params["offset"] = "1" # 1 always < 2000s + params["limit"] = PARAM_LIMIT + # in other case, do not set params['templates'] + if config.templates: + params["templates"] = 1 + + xml = getXMLPageCore(params=params, config=config, session=session) + if xml == "": + raise ExportAbortedError(config.index) + if "" not in xml: + raise PageMissingError(params["title"], xml) + + + yield xml.split("")[0] + + # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available + # else, warning about Special:Export truncating 
large page histories + r_timestamp = r"([^<]+)" + + edit_count = 0 + edit_count += len(re.findall(r_timestamp, xml)) + + # search for timestamps in xml to avoid analysing empty pages like + # Special:Allpages and the random one + if not config.curonly and re.search(r_timestamp, xml): + while not truncated and params["offset"]: # next chunk + # get the last timestamp from the acum XML + params["offset"] = re.findall(r_timestamp, xml)[-1] + try: + xml2 = getXMLPageCore(params=params, config=config, session=session) + except MemoryError: + print("The page's history exceeds our memory, halving limit.") + params["limit"] = params["limit"] / 2 + continue + + # are there more edits in this next XML chunk or no ? + if re.findall(r_timestamp, xml2): + if re.findall(r_timestamp, xml2)[-1] == params["offset"]: + # again the same XML, this wiki does not support params in + # Special:Export, offer complete XML up to X edits (usually + # 1000) + print( + "ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated" + ) + truncated = True + break + else: + """ + + + Main Page + 15580374 + edit=sysop:move=sysop (?) + + 418009832 + 2011-03-09T19:57:06Z + + """ + # offset is OK in this wiki, merge with the previous chunk + # of this page history and continue + try: + xml2 = xml2.split("")[0] + yield " " + ( + "".join(xml2.split("")[1:]) + ) + except MemoryError: + "The page's history exceeds our memory, halving limit." 
+ params["limit"] = params["limit"] / 2 + continue + xml = xml2 + edit_count += len(re.findall(r_timestamp, xml)) + else: + params["offset"] = "" # no more edits in this page history + yield "\n" + + if verbose: + if edit_count == 1: + print(" %s, 1 edit" % (title.strip())) + else: + print(" %s, %d edits" % (title.strip(), edit_count)) diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/__init__.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py new file mode 100644 index 00000000..dbdd3451 --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py @@ -0,0 +1,511 @@ +from datetime import datetime +import os +import sys +import time +from typing import Dict, List, Optional +from urllib.parse import urlparse +import lxml.etree + +import mwclient +import mwclient.errors +import requests + +from wikiteam3.dumpgenerator.cli.delay import Delay +from wikiteam3.dumpgenerator.exceptions import MWUnknownContentModelException, PageMissingError +from wikiteam3.dumpgenerator.log import log_error +from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI +from wikiteam3.dumpgenerator.api.page_titles import read_titles +from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import \ + make_xml_from_page, make_xml_page_from_raw +from wikiteam3.dumpgenerator.config import Config +from wikiteam3.utils.util import ALL_NAMESPACE_FLAG, XMLRIVISIONS_INCREMENTAL_DUMP_MARK, mark_as_done + +__ALL_NAMESPACE = -20241122 +""" magic number refers to ALL_NAMESPACE_FLAG """ + +def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, site: mwclient.Site, nscontinue=None, arvcontinue: Optional[str]=None): + if ALL_NAMESPACE_FLAG not in config.namespaces: + namespaces = config.namespaces + else: + # namespaces, namespacenames = 
getNamespacesAPI(config=config, session=session) + namespaces = [__ALL_NAMESPACE] + + # <- increasement xmldump + if env_arvcontinue := os.getenv("ARVCONTINUE", None): + mark_as_done(config, XMLRIVISIONS_INCREMENTAL_DUMP_MARK) + print(f"Using [env]ARVCONTINUE={env_arvcontinue}") + arvcontinue = env_arvcontinue + print("\n\n[NOTE] DO NOT use wikiteam3uploader to upload incremental xmldump to Internet Archive, we haven't implemented it yet\n\n") + # -> + + _nscontinue_input = nscontinue + _arvcontinue_input = arvcontinue + del nscontinue + del arvcontinue + + for namespace in namespaces: + # Skip retrived namespace + if namespace == __ALL_NAMESPACE: + assert len(namespaces) == 1, \ + "Only one item shoule be there when 'all' namespace are specified" + _nscontinue_input = None + else: + if _nscontinue_input is not None: + if namespace != _nscontinue_input: + print("Skipping already exported namespace: %d" % namespace) + continue + _nscontinue_input = None + + print("Trying to export all revisions from namespace %s" % namespace) + # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!) + arv_params = { + "action": "query", + "list": "allrevisions", + "arvlimit": config.api_chunksize, + "arvdir": "newer", + } + if namespace != __ALL_NAMESPACE: + arv_params['arvnamespace'] = namespace + if _arvcontinue_input is not None: + arv_params['arvcontinue'] = _arvcontinue_input + + if not config.curonly: + # We have to build the XML manually... + # Skip flags, presumably needed to add which is in the schema. + # Also missing: parentid and contentformat. 
+ ARV_PROP = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags" + arv_params[ + "arvprop" + ] = ARV_PROP + print( + "Trying to get wikitext from the allrevisions API and to build the XML" + ) + while True: + print("[arvcontinue]:", arv_params.get("arvcontinue", "")) + try: + allrevs_response = site.api( + http_method=config.http_method, **arv_params + ) + # reset params if the response is OK + arv_params["arvprop"] = ARV_PROP + if arv_params["arvlimit"] != config.api_chunksize: + arv_params["arvlimit"] = min(arv_params["arvlimit"] * 2, config.api_chunksize) + print(f"INFO: response is OK, increasing arvlimit to {arv_params['arvlimit']}") + except mwclient.errors.APIError as e: + if e.code == MWUnknownContentModelException.error_code: + if arv_params['arvlimit'] != 1: + # let's retry with arvlimit=1 to retrieve good revisions as much as possible + print("WARNING: API returned MWUnknownContentModelException. retrying with arvlimit=1 (revision by revision)") + arv_params["arvlimit"] = 1 + Delay(config=config) + continue + elif '|content' in arv_params["arvprop"]: + log_error(config=config, to_stdout=True, + text=f"ERROR: API returned MWUnknownContentModelException on arvcontinue={arv_params.get('arvcontinue', '')}, " + + "retried with arvlimit=1 and still failed. retrying without arvprop=content. " + + '(wikiteam3 would mark the revision as " in the xmldump)' + ) + arv_params["arvprop"] = ARV_PROP.replace('|content', '') + Delay(config=config) + continue + else: + assert False, "This should not happen" + else: + raise + + except requests.exceptions.HTTPError as e: + if ( + e.response.status_code == 405 + and config.http_method == "POST" + ): + print("POST request to the API failed, retrying with GET") + config.http_method = "GET" + Delay(config=config) + continue + else: + raise + except requests.exceptions.ReadTimeout as err: + # Hopefully temporary, just wait a bit and continue with the same request. 
+ # No point putting a limit to retries, we'd need to abort everything. + # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient + # to use the retry adapter we use for our own requests session? + print(f"ERROR: {str(err)}") + print("Sleeping for 20 seconds") + time.sleep(20) + continue + except mwclient.errors.InvalidResponse as e: + if ( + e.response_text.startswith("") # type: ignore + and config.http_method == "POST" + ): + print("POST request to the API failed (got HTML), retrying with GET") + config.http_method = "GET" + Delay(config=config) + continue + else: + raise + + for page in allrevs_response["query"]["allrevisions"]: + yield make_xml_from_page(page, arv_params.get("arvcontinue", "")) + + # find the continue parameter + if "continue" in allrevs_response: + # handle infinite loop + if arv_params.get("arvcontinue", None) == allrevs_response["continue"]["arvcontinue"]: + allrevs_response = handle_infinite_loop( + allrevs_response=allrevs_response, arv_params=arv_params, config=config, site=site + ) + # update continue parameter + arv_params["arvcontinue"] = allrevs_response["continue"]["arvcontinue"] + else: + # End of continuation. We are done with this namespace. 
+ break + + else: # curonly + # FIXME: this is not curonly, just different strategy to do all revisions + # Just cycle through revision IDs and use the XML as is + print("Trying to list the revisions and to export them one by one") + # We only need the revision ID, all the rest will come from the raw export + arv_params["arvprop"] = "ids" + try: + allrevs_response = site.api( + http_method=config.http_method, **arv_params + ) + except requests.exceptions.HTTPError as e: + if ( + e.response.status_code == 405 + and config.http_method == "POST" + ): + print("POST request to the API failed, retrying with GET") + config.http_method = "GET" + raise NotImplementedError("FIXME: here we should retry the same namespace") + continue # FIXME: here we should retry the same namespace + else: + raise + export_params = { + "action": "query", + "export": "1", + } + # Skip the namespace if it's empty + if len(allrevs_response["query"]["allrevisions"]) < 1: + # TODO: log this + continue + # Repeat the arvrequest with new arvparams until done + while True: + # Reset revision IDs from the previous batch from arv + revids: List[str] = [] + for page in allrevs_response["query"]["allrevisions"]: + for revision in page["revisions"]: + revids.append(str(revision["revid"])) + print( + " %d more revisions listed, until %s" + % (len(revids), revids[-1]) + ) + + # We can now get the XML for one revision at a time + # FIXME: we can actually get them in batches as we used to + # but need to figure out the continuation and avoid that the API + # chooses to give us only the latest for each page + for revid in revids: + export_params["revids"] = revid + try: + export_response = site.api( + http_method=config.http_method, **export_params + ) + except requests.exceptions.HTTPError as e: + if ( + e.response.status_code == 405 + and config.http_method == "POST" + ): + print( + "POST request to the API failed, retrying with GET" + ) + config.http_method = "GET" + export_response = site.api( + 
http_method=config.http_method, **export_params + ) + else: + raise + + # This gives us a self-standing element + # but we only need the inner : we can live with + # duplication and non-ordering of page titles, but the + # repeated header is confusing and would not even be valid + xml: str = export_response["query"]["export"]["*"] + yield make_xml_page_from_raw(xml, arv_params.get("arvcontinue", "")) + + if "continue" in allrevs_response: + # Get the new ones + # NOTE: don't need to handle infinite loop here, because we are only getting the revids + + arv_params["arvcontinue"] = allrevs_response["continue"]["arvcontinue"] + try: + allrevs_response = site.api( + http_method=config.http_method, **arv_params + ) + except requests.exceptions.HTTPError as e: + if ( + e.response.status_code == 405 + and config.http_method == "POST" + ): + print( + "POST request to the API failed, retrying with GET" + ) + config.http_method = "GET" + allrevs_response = site.api( + http_method=config.http_method, **arv_params + ) + except requests.exceptions.ReadTimeout as err: + # As above + print(f"ERROR: {str(err)}") + print("Sleeping for 20 seconds") + time.sleep(20) + # But avoid rewriting the same revisions + allrevs_response["query"]["allrevisions"] = [] + continue + else: + # End of continuation. We are done with this namespace. + break + + +def getXMLRevisionsByTitles(config: Config, session: requests.Session, site: mwclient.Site, start=None): + if config.curonly: + # The raw XML export in the API gets a title and gives the latest revision. + # We could also use the allpages API as generator but let's be consistent. 
+ print("Getting titles to export the latest revision for each") + c = 0 + for title in read_titles(config, session=session, start=start): + # TODO: respect verbose flag, reuse output from getXMLPage + print(f" {title}") + # TODO: as we're doing one page and revision at a time, we might + # as well use xml format and exportnowrap=1 to use the string of, + # XML as is, but need to check how well the library handles it. + exportparams = { + "action": "query", + "titles": title, + "export": "1", + } + try: + export_response = site.api( + http_method=config.http_method, **exportparams + ) + except requests.exceptions.HTTPError as e: + if ( + e.response.status_code == 405 + and config.http_method == "POST" + ): + print("POST request to the API failed, retrying with GET") + config.http_method = "GET" + export_response = site.api( + http_method=config.http_method, **exportparams + ) + else: + raise + + xml = str(export_response["query"]["export"]["*"]) + c += 1 + if c % 10 == 0: + print(f"\n-> Downloaded {c} pages\n") + # Because we got the fancy XML from the JSON format, clean it: + yield make_xml_page_from_raw(xml, None) + else: + # This is the closest to what we usually do with Special:Export: + # take one title at a time and try to get all revisions exported. + # It differs from the allrevisions method because it actually needs + # to be input the page titles; otherwise, the requests are similar. + # The XML needs to be made manually because the export=1 option + # refuses to return an arbitrary number of revisions (see above). + print("Getting titles to export all the revisions of each") + c = 0 + titlelist = [] + # TODO: Decide a suitable number of a batched request. Careful: + # batched responses may not return all revisions. + for title in read_titles(config, session=session, start=start): + print(f" {title}") + titlelist = [title] + # Try and ask everything. 
At least on MediaWiki 1.16, uknown props are discarded: + # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}} + pparams = { + "action": "query", + "titles": "|".join(titlelist), + "prop": "revisions", + 'rvlimit': config.api_chunksize, + "rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags", + } + try: + api_response = site.api(http_method=config.http_method, **pparams) + except requests.exceptions.HTTPError as e: + if ( + e.response.status_code == 405 + and config.http_method == "POST" + ): + print("POST request to the API failed, retrying with GET") + config.http_method = "GET" + api_response = site.api( + http_method=config.http_method, **pparams + ) + else: + raise + except mwclient.errors.InvalidResponse: + log_error( + config=config, to_stdout=True, + text="Error: page inaccessible? Could not export page: %s" + % ("; ".join(titlelist)), + ) + continue + + # Be ready to iterate if there is continuation. + while True: + # Get the revision data returned by the API: prequest is the initial request + # or the new one after continuation at the bottom of this while loop. + # The array is called "pages" even if there's only one. + try: + pages = api_response["query"]["pages"] + except KeyError: + log_error( + config=config, to_stdout=True, + text="Error: page inaccessible? Could not export page: %s" + % ("; ".join(titlelist)), + ) + break + # Go through the data we got to build the XML. + for pageid in pages: + try: + xml = make_xml_from_page(pages[pageid], None) + yield xml + except PageMissingError: + log_error( + config=config, to_stdout=True, + text="Error: empty revision from API. Could not export page: %s" + % ("; ".join(titlelist)), + ) + continue + + # Get next batch of revisions if there's more. 
+ if "continue" in api_response.keys(): + print("Getting more revisions for the page") + for key, value in api_response["continue"].items(): + pparams[key] = value + elif "query-continue" in api_response.keys(): + rvstartid = api_response["query-continue"]["revisions"]["rvstartid"] + pparams["rvstartid"] = rvstartid + else: + break + + try: + api_response = site.api( + http_method=config.http_method, **pparams + ) + except requests.exceptions.HTTPError as e: + if ( + e.response.status_code == 405 + and config.http_method == "POST" + ): + print("POST request to the API failed, retrying with GET") + config.http_method = "GET" + api_response = site.api( + http_method=config.http_method, **pparams + ) + + # We're done iterating for this title or titles. + c += len(titlelist) + # Reset for the next batch. + titlelist = [] + if c % 10 == 0: + print(f"\n-> Downloaded {c} pages\n") + + +def getXMLRevisions(config: Config, session: requests.Session, lastPage: Optional[lxml.etree._ElementTree]=None, useAllrevision=True): + # FIXME: actually figure out the various strategies for each MediaWiki version + apiurl = urlparse(config.api) + site = mwclient.Site( + apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session + ) + + if useAllrevision: + # Find last title + if lastPage is not None: + try: + lastNs = int(lastPage.find('ns').text) # type: ignore + if False: + lastRevision = lastPage.find('revision') + lastTimestamp = lastRevision.find('timestamp').text + lastRevid = int(lastRevision.find('id').text) + lastDatetime = datetime.fromisoformat(lastTimestamp.rstrip('Z')) + lastArvcontinue = lastDatetime.strftime("%Y%m%d%H%M%S") + '|' + str(lastRevid) + else: + lastArvcontinue = lastPage.attrib['arvcontinue'] + except Exception: + print("Failed to find title in last trunk XML: %s" % (lxml.etree.tostring(lastPage))) + raise + nscontinue = lastNs + arvcontinue = lastArvcontinue + if not arvcontinue: + arvcontinue = None + else: + nscontinue = None + 
arvcontinue = None + + try: + return getXMLRevisionsByAllRevisions(config, session, site, nscontinue, arvcontinue) + except (KeyError, mwclient.errors.InvalidResponse) as e: + print(e) + # TODO: check whether the KeyError was really for a missing arv API + print("Warning. Could not use allrevisions. Wiki too old? Try to use --xmlrevisions_page") + sys.exit(1) + else: + # Find last title + if lastPage is not None: + try: + start = lastPage.find('title') # type: ignore + except Exception: + print("Failed to find title in last trunk XML: %s" % (lxml.etree.tostring(lastPage))) + raise + else: + start = None + + try: + # # Uncomment these lines to raise an KeyError for testing + # raise KeyError(999999) + # # DO NOT UNCOMMMENT IN RELEASE + return getXMLRevisionsByTitles(config, session, site, start) + except mwclient.errors.MwClientError as e: + print(e) + print("This mwclient version seems not to work for us. Exiting.") + sys.exit(1) + + +def handle_infinite_loop(allrevs_response: Dict, arv_params: Dict, config: Config, site: mwclient.Site) -> Dict: + """ + return new allrevs_response without arvprop=content|comment if the response is truncated + """ + + assert len(allrevs_response["query"]["allrevisions"]) == 0, \ + "We should have received no revisions if we are stuck in a infinite loop" + print("WARNING: API returned continue parameter that doesn't change, we might be stuck in a loop") + print(f"current continue parameter: {arv_params.get('arvcontinue')}") + print(f"API warnings: {allrevs_response.get('warnings', {})}") + + if "truncated" in allrevs_response.get("warnings",{}).get("result",{}).get("*",""): + # workaround for [truncated API response for "allrevisions" causes infinite loop ] + # (https://github.com/mediawiki-client-tools/mediawiki-scraper/issues/166) + print("Let's try to skip this revision and continue...") + _arv_params_temp = arv_params.copy() + # make sure response is small + _arv_params_temp['arvprop'] = 
_arv_params_temp['arvprop'].replace('|content', '').replace('|comment', '') + _arv_params_temp["arvlimit"] = 1 + + allrevs_response_new = site.api( + http_method=config.http_method, **_arv_params_temp + ) + assert len(allrevs_response_new["query"]["allrevisions"]) == 1, \ + "Couldn't get a single revision to skip the infinite loop" # arvlimit=1 + assert arv_params.get("arvcontinue", None) != allrevs_response_new.get("continue", {}).get("arvcontinue", None), \ + "??? Infinite loop is still there ???" + # success, let's continue + log_error(config=config, to_stdout=True, + text=f"ERROR: API returned continue parameter '{arv_params.get('arvcontinue')}' that doesn't change, " + f"skipped this revision to avoid infinite loop") + return allrevs_response_new + else: + raise NotImplementedError("Unable to solve the infinite loop automatically") diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py new file mode 100644 index 00000000..2c465ace --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py @@ -0,0 +1,123 @@ +from typing import Dict, Optional +import xml.etree.ElementTree as ET + +from lxml import etree +from lxml.builder import E + +from wikiteam3.dumpgenerator.exceptions import PageMissingError + +def make_xml_page_from_raw(xml: str, arvcontinue: Optional[str] = None) -> str: + """Discard the metadata around a element in string + + arvcontinue: None -> disable arvcontinue (default) + arvcontinue: string (including empty "") -> write arvcontinue to XML (for api:allrevisions resuming) + """ + tree: ET.Element = ET.XML(xml) + page: ET.Element | None = tree.find(".//{*}page") + + assert page is not None + + if arvcontinue is not None: + page.attrib['arvcontinue'] = arvcontinue + # remove namespace prefix + for elem in tree.iter(): + elem.tag = elem.tag.split('}', 1)[-1] + + return ET.tostring(page, encoding="unicode", method="xml", xml_declaration=False) + 
+ +def make_xml_from_page(page: Dict, arvcontinue: Optional[str] = None) -> str: + """Output an XML document as a string from a page as in the API JSON + + arvcontinue: None -> disable arvcontinue (default) + arvcontinue: string (including empty "") -> write arvcontinue to XML (for api:allrevisions resuming) + """ + try: + p = E.page( + E.title(str(page["title"])), + E.ns(str(page["ns"])), + E.id(str(page["pageid"])), + ) + if arvcontinue is not None: + p.attrib['arvcontinue'] = arvcontinue + for rev in page["revisions"]: + # Older releases like MediaWiki 1.16 do not return all fields. + if "userid" in rev: + userid = rev["userid"] + else: + userid = 0 + if "size" in rev: + size = rev["size"] + else: + size = 0 + + # Create rev object + revision = [E.id(str(rev["revid"])), + E.timestamp(rev["timestamp"]),] + + # The text, user, comment, sha1 may be deleted/suppressed + if (('texthidden' in rev) or ('textmissing' in rev)) or ('*' not in rev): + print("Warning: text missing/hidden in pageid %d revid %d" % (page['pageid'], rev['revid'])) + revision.append(E.text(**{ + 'bytes': str(size), + 'deleted': 'deleted', + })) + else: + text = str(rev["*"]) + revision.append(E.text(text, **{ + 'bytes': str(size), + '{http://www.w3.org/XML/1998/namespace}space': 'preserve', + })) + + if "user" not in rev: + if "userhidden" not in rev: + print("Warning: user not hidden but missing user in pageid %d revid %d" % (page['pageid'], rev['revid'])) + revision.append(E.contributor(deleted="deleted")) + else: + revision.append( + E.contributor( + E.username(str(rev["user"])), + E.id(str(userid)), + ) + ) + + if "sha1" not in rev: + if "sha1hidden" in rev: + revision.append(E.sha1()) # stub + else: + # The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia). 
+ pass + elif "sha1" in rev: + revision.append(E.sha1(rev["sha1"])) + + + if 'commenthidden' in rev: + revision.append(E.comment(deleted="deleted")) + elif "comment" in rev and rev["comment"]: + revision.append(E.comment(str(rev["comment"]))) + + if "contentmodel" in rev: + revision.append(E.model(rev["contentmodel"])) + if "contentformat" in rev: + revision.append(E.format(rev["contentformat"])) + if "parentid" in rev and int(rev["parentid"]) > 0: + revision.append(E.parentid(str(rev["parentid"]))) + + if "minor" in rev: + revision.append(E.minor()) + + # mwcli's dump.xml order + revisionTags = ['id', 'parentid', 'timestamp', 'contributor', 'minor', 'comment', 'origin', 'model', 'format', 'text', 'sha1'] + revisionElementsDict = {elem.tag: elem for elem in revision} + _revision = E.revision() + for tag in revisionTags: + if tag in revisionElementsDict: + _revision.append(revisionElementsDict.pop(tag)) + for elem in revisionElementsDict.values(): + _revision.append(elem) + p.append(_revision) + except KeyError as e: + import traceback + traceback.print_exc() + raise PageMissingError(page["title"], e) + return etree.tostring(p, pretty_print=True, encoding="unicode") diff --git a/wikiteam3/dumpgenerator/dump/redirect/allredirects.py b/wikiteam3/dumpgenerator/dump/redirect/allredirects.py new file mode 100644 index 00000000..e2ac1a52 --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/redirect/allredirects.py @@ -0,0 +1,60 @@ + +import requests + +from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI +from wikiteam3.dumpgenerator.cli.delay import Delay +from wikiteam3.dumpgenerator.config import Config +from wikiteam3.utils.util import ALL_NAMESPACE_FLAG + +def get_redirects_by_allredirects(config: Config, session: requests.Session): + continueKey = 'arcontinue' + assert config.api, "API URL is required" + + namespaces, namespacenames = getNamespacesAPI(config=config, session=session) + ar_params = { + "action": "query", + "format": "json", + "list": 
"allredirects", + "arlimit": config.api_chunksize, + "arprop": "ids|title|fragment|interwiki", + "ardir": "ascending", + "continue": "" # DEV.md#Continuation + } + for ns in namespaces: + if continueKey in ar_params: + del ar_params[continueKey] # reset continue parameter + if ALL_NAMESPACE_FLAG not in config.namespaces: # user has specified namespaces + if ns not in config.namespaces: + print(f"Skipping namespace {ns}") + continue + + print(f"Processing namespace {ns} ({namespacenames[ns] if ns in namespacenames else 'unknown'})") + ar_params["arnamespace"] = str(ns) + while True: + Delay(config=config) + r = session.get(url=config.api, params=ar_params) + allredirects_response = r.json() + + redirects = allredirects_response["query"]["allredirects"] + for redirect in redirects: + yield redirect + + if "continue" in allredirects_response: + # update continue parameter + ar_params[continueKey] = allredirects_response["continue"][continueKey] + print(f" {continueKey}={ar_params[continueKey]}") + else: + # End of continuation. We are done with this namespace. 
+ break + +# TODO: unit test +if __name__ == "__main__": + config = Config( + api="https://en.wikipedia.org/w/api.php", + namespaces=[ALL_NAMESPACE_FLAG], # type: ignore + redirects=True, + api_chunksize=500 + ) + ss = requests.Session() + for redirect in get_redirects_by_allredirects(config, ss): + print(redirect) \ No newline at end of file diff --git a/wikiteam3/dumpgenerator/dump/redirect/redirects_dump.py b/wikiteam3/dumpgenerator/dump/redirect/redirects_dump.py new file mode 100644 index 00000000..c5361d49 --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/redirect/redirects_dump.py @@ -0,0 +1,42 @@ + + +import json +import os +import requests +from wikiteam3.dumpgenerator.config import Config +from wikiteam3.dumpgenerator.dump.redirect.allredirects import get_redirects_by_allredirects +from wikiteam3.utils.identifier import url2prefix_from_config + + +def generate_redirects_dump(config: Config, resume=False, *, session: requests.Session): + tmp_filename = "{}-{}-redirects.tmp".format( + url2prefix_from_config(config=config), config.date + ) + redirects_filename = "{}-{}-redirects.jsonl".format( + url2prefix_from_config(config=config), config.date + ) + + if resume: + if os.path.exists("{}/{}".format(config.path, redirects_filename)): + print("redirects dump was completed in the previous session") + return + + print("Resuming is not supported yet, regenerating the redirects dump") + + + tmp_file = open( + "{}/{}".format(config.path, tmp_filename), "w", encoding="utf-8" + ) + for redirect in get_redirects_by_allredirects(config, session): + print(" ", redirect) + tmp_file.write( + json.dumps(redirect, ensure_ascii=False, separators=(",", ":")) + +"\n" + ) + tmp_file.close() + + + os.rename( + "{}/{}".format(config.path, tmp_filename), + "{}/{}".format(config.path, redirects_filename), + ) \ No newline at end of file diff --git a/wikiteam3/dumpgenerator/dump/xmldump/__init__.py b/wikiteam3/dumpgenerator/dump/xmldump/__init__.py new file mode 100644 index 
00000000..e69de29b diff --git a/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py b/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py new file mode 100644 index 00000000..42bc0581 --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py @@ -0,0 +1,150 @@ +from io import TextIOWrapper +import re +import sys +from typing import Optional + +import lxml.etree +import requests + +from wikiteam3.dumpgenerator.cli import Delay +from wikiteam3.utils import url2prefix_from_config +from wikiteam3.dumpgenerator.exceptions import PageMissingError +from wikiteam3.dumpgenerator.log import log_error +from wikiteam3.dumpgenerator.api.page_titles import read_titles +from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import get_XML_page +from wikiteam3.dumpgenerator.config import Config +from wikiteam3.utils import clean_XML, undo_HTML_entities +from wikiteam3.dumpgenerator.dump.xmldump.xml_header import getXMLHeader +from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions import getXMLRevisions +from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import truncateXMLDump, parse_last_page_chunk + + +def doXMLRevisionDump(config: Config, session: requests.Session, xmlfile: TextIOWrapper, + lastPage: Optional[lxml.etree._ElementTree]=None, useAllrevisions: bool=False): + try: + r_timestamp = r"([^<]+)" + r_arvcontinue = r'' + + lastArvcontinue = None + for xml in getXMLRevisions(config=config, session=session, lastPage=lastPage, useAllrevision=useAllrevisions): + numrevs = len(re.findall(r_timestamp, xml)) + arvcontinueRe = re.findall(r_arvcontinue, xml) + if arvcontinueRe: + curArvcontinue = arvcontinueRe[0] + if lastArvcontinue != curArvcontinue: + Delay(config=config) + lastArvcontinue = curArvcontinue + # Due to how generators work, it's expected this may be less + xml = clean_XML(xml=xml) + xmlfile.write(xml) + + xmltitle = re.search(r"([^<]+)", xml) + assert xmltitle, f"Failed to find title in XML: {xml}" + title = undo_HTML_entities(text=xmltitle.group(1)) 
+ print(f'{title}, {numrevs} edits') + # Delay(config=config) + except AttributeError as e: + print(e) + print("This API library version is not working") + sys.exit(1) + except UnicodeEncodeError as e: + print(e) + +def doXMLExportDump(config: Config, session: requests.Session, xmlfile: TextIOWrapper, lastPage=None): + print( + '\nRetrieving the XML for every page\n' + ) + + lock = True + start = None + if lastPage is not None: + try: + start = lastPage.find('title').text + except Exception: + print("Failed to find title in last trunk XML: %s" % (lxml.etree.tostring(lastPage))) + raise + else: + # requested complete xml dump + lock = False + + c = 1 + for title in read_titles(config, session=session, start=start): + if not title: + continue + if title == start: # start downloading from start, included + lock = False + if lock: + continue + Delay(config=config) + if c % 10 == 0: + print(f"\n-> Downloaded {c} pages\n") + try: + for xml in get_XML_page(config=config, title=title, session=session): + xml = clean_XML(xml=xml) + xmlfile.write(xml) + except PageMissingError: + log_error( + config=config, to_stdout=True, + text='The page "%s" was missing in the wiki (probably deleted)' + % title, + ) + # here, XML is a correct chunk or + # an empty string due to a deleted page (logged in errors log) or + # an empty string due to an error while retrieving the page from server + # (logged in errors log) + c += 1 + + +def generate_XML_dump(config: Config, resume=False, *, session: requests.Session): + """Generates a XML dump for a list of titles or from revision IDs""" + + header, config = getXMLHeader(config=config, session=session) + footer = "\n" # new line at the end + xmlfilename = "{}-{}-{}.xml".format( + url2prefix_from_config(config=config), + config.date, + "current" if config.curonly else "history", + ) + xmlfile = None + + lastPage = None + lastPageChunk = None + # start != None, means we are resuming a XML dump + if resume: + print( + "Removing the last chunk of 
past XML dump: it is probably incomplete." + ) + # truncate XML dump if it already exists + lastPageChunk = truncateXMLDump(f"{config.path}/{xmlfilename}") + if not lastPageChunk.strip(): + print("Last page chunk is NULL, we'll directly start a new dump!") + resume = False + lastPage = None + else: + lastPage = parse_last_page_chunk(lastPageChunk) + if lastPage is None: + print("Failed to parse last page chunk: \n%s" % lastPageChunk) + print("Cannot resume, exiting now!") + sys.exit(1) + + print("WARNING: will try to start the download...") + xmlfile = open( + "{}/{}".format(config.path, xmlfilename), "a", encoding="utf-8" + ) + else: + print("\nRetrieving the XML for every page from the beginning\n") + xmlfile = open( + "{}/{}".format(config.path, xmlfilename), "w", encoding="utf-8" + ) + xmlfile.write(header) + + if config.xmlrevisions and not config.xmlrevisions_page: + doXMLRevisionDump(config, session, xmlfile, lastPage, useAllrevisions=True) + elif config.xmlrevisions and config.xmlrevisions_page: + doXMLRevisionDump(config, session, xmlfile, lastPage, useAllrevisions=False) + else: # --xml + doXMLExportDump(config, session, xmlfile, lastPage) + xmlfile.write(footer) + xmlfile.close() + print("XML dump saved at...", xmlfilename) + return xmlfilename diff --git a/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py b/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py new file mode 100644 index 00000000..8836586f --- /dev/null +++ b/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py @@ -0,0 +1,131 @@ +import json +import re +import sys +from typing import Tuple + +import requests + +from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingError +from wikiteam3.dumpgenerator.log import log_error +from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import get_XML_page +from wikiteam3.dumpgenerator.config import Config + +def getXMLHeader(config: Config, session: requests.Session) -> Tuple[str, Config]: + """Retrieve a random page to 
extract XML headers (namespace info, etc)""" + # get the header of a random page, to attach it in the complete XML backup + # similar to: does not exist. Not a problem, if we get the . + xml = pme.xml + # Issue 26: Account for missing "Special" namespace. + # Hope the canonical special name has not been removed. + # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases + except ExportAbortedError: + try: + if config.api: + print("Trying the local name for the Special namespace instead") + r = session.get( + url=config.api, + params={ + "action": "query", + "meta": "siteinfo", + "siprop": "namespaces", + "format": "json", + }, + timeout=120, + ) + config.export = ( + json.loads(r.text)["query"]["namespaces"]["-1"]["*"] + ":Export" + ) + xml = "".join( + [ + x + for x in get_XML_page( + config=config, + title=randomtitle, + verbose=False, + session=session, + ) + ] + ) + except PageMissingError as pme: + xml = pme.xml + except ExportAbortedError: + pass + + header = xml.split("")[0] + if not re.match(r"\s* int: + """Returns the number of newlines at the end of file""" + + with FileReadBackwards(filename, encoding="utf-8") as frb: + newlines = 0 + while frb.readline() == "": + newlines += 1 + return newlines + + +def addNewline(filename: str) -> None: + """Adds a newline to the end of file""" + + print(f"Adding newline to end of {filename}") + with open(filename, "a", encoding="utf-8") as f: + f.write("\n") + + +def truncateXMLDump(filename: str, dryrun: bool = False) -> str: + """ + Removes incomplete elements from the end of XML dump files + + dryrun: bool - returns the incomplete segment without truncating the file + """ + + with FileReadBackwards(filename, encoding="utf-8") as frb: + incomplete_segment: str = "" + xml_line: str = frb.readline() + while xml_line and "" not in xml_line: + incomplete_segment = xml_line + incomplete_segment + xml_line = frb.readline() + while xml_line and "" not in xml_line: + incomplete_segment = xml_line 
+ incomplete_segment + xml_line = frb.readline() + if dryrun: + return incomplete_segment + incomplete_segment_size = len(incomplete_segment.encode("utf-8")) + file_size = os.path.getsize(filename) + if file_size > incomplete_segment_size: + with open(filename, "r+", encoding="utf-8") as fh: + fh.truncate(file_size - incomplete_segment_size) + else: + print( + 'len(incomplete_segment.encode("utf-8")) returned ' + + str(incomplete_segment_size) + + ", while os.path.getsize(filename) returned " + + str(file_size) + + ", so fh.truncate() would be fh.truncate(" + + str(file_size - incomplete_segment_size) + + "), which would be illegal. Something is seriously wrong here!" + ) + + # add newline to prevent ` ` in one line + if endsWithNewlines(filename) == 0: + addNewline(filename) + elif endsWithNewlines(filename) > 1: + print( + f"WARNING: {filename} has {endsWithNewlines(filename)} newlines" + ) + return incomplete_segment + +def parse_last_page_chunk(chunk: str) -> Optional[lxml.etree._ElementTree]: + try: + parser = lxml.etree.XMLParser(recover=True) + tree: lxml.etree._ElementTree = lxml.etree.parse(StringIO(chunk), parser) + return tree.getroot() + except lxml.etree.LxmlError: + print("Failed to parse last page chunk") + return None \ No newline at end of file diff --git a/wikiteam3/dumpgenerator/exceptions.py b/wikiteam3/dumpgenerator/exceptions.py new file mode 100644 index 00000000..d7cef602 --- /dev/null +++ b/wikiteam3/dumpgenerator/exceptions.py @@ -0,0 +1,57 @@ +from typing import Optional + + +class InternalApiError(Exception): + """ base class for all internal API errors """ + error_code = "internal_api_error_*" + errorclass = "MW*Exception" + common_cause = "reason a; reason b; reason c" + samples = ["url"] + + +class MWUnknownContentModelException(InternalApiError): + error_code = "internal_api_error_MWUnknownContentModelException" + errorclass = "MWUnknownContentModelException" + common_cause = "The content model xxxxx is not registered on this wiki; 
Some extensions use special content models for their own purposes, but they did not register a handler to export their content (?)" + samples = [ + "https://web.archive.org/web/20231015082428id_/https://www.wikidoc.org/api.php?titles=Talk%3AMain_Page&action=query&format=xml&prop=revisions&rvprop=timestamp|user|comment|content|ids|flags|size|userid|sha1|contentmodel&rvlimit=50", + "https://web.archive.org/web/20231015082600id_/https://www.wikidoc.org/api.php?titles=Talk%3AMain_Page&action=query&format=json&prop=revisions&rvprop=timestamp|user|comment|content|ids|flags|size|userid|sha1|contentmodel&rvlimit=50" + ] + + +class PageMissingError(Exception): + def __init__(self, title, xml): + self.title = title + self.xml = xml + + def __str__(self): + return "page '%s' not found" % self.title + + +class ExportAbortedError(Exception): + def __init__(self, index): + self.index = index + + def __str__(self): + return "Export from '%s' did not return anything." % self.index + + +class FileSizeError(Exception): + def __init__(self, file: str, got_size: int, excpected_size: int, online_url: Optional[str] = None): + self.file = file + self.got_size = got_size + self.excpected_size = excpected_size + self.online_url = online_url + + def __str__(self): + return f"File '{self.file}' size {self.got_size} is not match '{self.excpected_size}'." \ + + (f"(url: {self.online_url})" if self.online_url else "") + + +class FileSha1Error(Exception): + def __init__(self, file, excpected_sha1): + self.file = file + self.excpected_sha1 = excpected_sha1 + + def __str__(self): + return f"File '{self.file}' sha1 is not match '{self.excpected_sha1}'." 
diff --git a/wikiteam3/dumpgenerator/log/__init__.py b/wikiteam3/dumpgenerator/log/__init__.py new file mode 100644 index 00000000..d303a511 --- /dev/null +++ b/wikiteam3/dumpgenerator/log/__init__.py @@ -0,0 +1 @@ +from .log_error import log_error diff --git a/wikiteam3/dumpgenerator/log/log_error.py b/wikiteam3/dumpgenerator/log/log_error.py new file mode 100644 index 00000000..179d064f --- /dev/null +++ b/wikiteam3/dumpgenerator/log/log_error.py @@ -0,0 +1,15 @@ +import datetime + +from wikiteam3.dumpgenerator.config import Config + +def log_error(config: Config, to_stdout=False , text="") -> None: + """Log error in errors.log""" + if text: + with open(f"{config.path}/errors.log", "a", encoding="utf-8") as outfile: + output = "{}: {}\n".format( + datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), + text, + ) + outfile.write(output) + if to_stdout: + print(text) diff --git a/wikiteam3/dumpgenerator/version.py b/wikiteam3/dumpgenerator/version.py new file mode 100644 index 00000000..ec44dc8c --- /dev/null +++ b/wikiteam3/dumpgenerator/version.py @@ -0,0 +1,11 @@ +__VERSION__ = "unknown" + +try: + import importlib.metadata + __VERSION__ = importlib.metadata.version("wikiteam3") +except Exception: + pass + + +def getVersion(): + return __VERSION__ diff --git a/wikiteam3/tools/get_arvcontinue.py b/wikiteam3/tools/get_arvcontinue.py new file mode 100644 index 00000000..77b7f42e --- /dev/null +++ b/wikiteam3/tools/get_arvcontinue.py @@ -0,0 +1,21 @@ +import argparse + +from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import parse_last_page_chunk, truncateXMLDump + +def parse_args(): + parser = argparse.ArgumentParser(description="Get the next arvcontinue value") + parser.add_argument("xml", help="XML file") + args = parser.parse_args() + return args + +def main(): + args = parse_args() + xmlfile: str = args.xml + lastPageChunk = truncateXMLDump(xmlfile, dryrun=True) + lastPage = parse_last_page_chunk(lastPageChunk) + assert lastPage is 
not None + lastArvcontinue = lastPage.attrib['arvcontinue'] + print(f'ARVCONTINUE="{lastArvcontinue}"') + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/wikiteam3/uploader/__init__.py b/wikiteam3/uploader/__init__.py new file mode 100644 index 00000000..223e904f --- /dev/null +++ b/wikiteam3/uploader/__init__.py @@ -0,0 +1,4 @@ +from wikiteam3.uploader.uploader import main + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/wikiteam3/uploader/__main__.py b/wikiteam3/uploader/__main__.py new file mode 100644 index 00000000..223e904f --- /dev/null +++ b/wikiteam3/uploader/__main__.py @@ -0,0 +1,4 @@ +from wikiteam3.uploader.uploader import main + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/wikiteam3/uploader/compresser.py b/wikiteam3/uploader/compresser.py new file mode 100644 index 00000000..4c892d03 --- /dev/null +++ b/wikiteam3/uploader/compresser.py @@ -0,0 +1,225 @@ + +import os +from pathlib import Path +import subprocess +import sys +import time +from typing import Tuple, Union +import warnings + + +class ZstdCompressor: + DEFAULT_LEVEL = 17 + MIN_VERSION = (1, 4, 8) + bin_zstd = "zstd" + + # additional options + rezstd: bool = False + rezstd_endpoint: str = "http://pool-rezstd.saveweb.org/rezstd/" + + def __init__(self, bin_zstd: str = "zstd", + rezstd: bool = False, rezstd_endpoint: str = "http://pool-rezstd.saveweb.org/rezstd/"): + """ + bin_zstd: path to zstd binary + rezstd: upload zstd pre-compressed file to rezstd server for recompression with "best" (-22 --ultra --long=31) configuration. 
+ rezstd_endpoint: the endpoint of rezstd server + """ + self.bin_zstd = bin_zstd + version = self.versionNumber() + assert version >= self.MIN_VERSION, f"zstd version must be >= {self.MIN_VERSION}" + # if v1.5.0-v1.5.4 + if (1, 5, 0) <= version <= (1, 5, 4): + warnings.warn("your zstd version is between 1.5.0 and 1.5.4, which is not recommended due to a rare corruption bug in high compression mode, PLEASE UPGRADE TO 1.5.5+") + sys.exit(1) + + self.rezstd = rezstd + self.rezstd_endpoint = rezstd_endpoint + + def versionNumber(self) -> Tuple[int, int, int]: + """ + Return runtime library version, the value is (`MAJOR`, `MINOR`, `RELEASE`). + """ + rettext = subprocess.check_output([self.bin_zstd, "-q", "-V"], shell=False).decode().strip() + # 1.5.5 + ret_versions = [int(x) for x in rettext.split(".")] + assert len(ret_versions) == 3 + return tuple(ret_versions) # type: ignore + + def compress_file(self, path: Union[str, Path], *, level: int = DEFAULT_LEVEL, long_level: int = 31) -> Path: + ''' Compress path into path.zst and return the absolute path to the compressed file. + + we set -T0 to use all cores + + level: + - 1 -> fast + - ... + - 19 -> high + - ... (ultra mode) + - 22 -> best + long_level: + - 31 -> 2^31 (2GB) window size (default) + - 30 -> 2^30 (1GB) + - ... + - 0 -> Disable --long flag + ''' + if isinstance(path, str): + path = Path(path) + path = path.resolve() # remove trailing slash + + compressed_path = path.parent / (path.name + ".zst") # path + ".zst" + compressed_path = compressed_path.resolve() + + compressing_temp_path = path.parent / (path.name + ".zst.tmp") # path + ".zst.tmp" + compressing_temp_path = compressing_temp_path.resolve() + + if compressed_path.exists(): + print(f"File {compressed_path} already exists. 
Skip compressing.") + return compressed_path + + cmd = [self.bin_zstd, "-T0","-v", "--compress", "--force"] + if level >= 20: + cmd.append("--ultra") + if long_level: + cmd.append(f"--long={long_level}") + cmd.extend([f"-{level}", str(path), "-o", str(compressing_temp_path)]) + + subprocess.run(cmd) + assert compressing_temp_path.exists() + if self.rezstd: + pre_compressing_temp_path = compressing_temp_path # alias + + compressing_rezstded_temp_path = path.parent / (path.name + ".rezstded.tmp") + compressing_rezstded_temp_path = compressing_rezstded_temp_path.resolve() + + assert self.rezstd_endpoint.endswith("/") + import requests + session = requests.Session() + # upload to rezstd + print("Creating rezstd task...") + # TODO: reuse previous task_id + r = session.post(self.rezstd_endpoint + 'create/chunked') + print(r.text) + task_id = r.json()["task_id"] + # upload chunks + total_size = pre_compressing_temp_path.stat().st_size + chunk_size = 1024 * 1024 * 50 # 50MB + with open(pre_compressing_temp_path, "rb") as f: + # /rezstd/upload/chunked/:task_id/:chunk_id + upload_bytes = 0 + chunk_id = 0 + while chunk := f.read(chunk_size): + # TODO: parrallel upload + r = session.put(self.rezstd_endpoint + f"upload/chunked/{task_id}/{chunk_id}", files={"chunk": chunk}) + assert "error" not in r.json() + upload_bytes += len(chunk) + print(f"Uploaded {upload_bytes/1024/1024:.2f}/{total_size/1024/1024:.2f} MB", end="\r") + chunk_id += 1 + print() + # r.POST("/rezstd/concat/chunked/:task_id/:max_chunk_id/:total_size" + print("Concatenating chunks...") + max_chunk_id = chunk_id - 1 or 0 + r = session.post(self.rezstd_endpoint + f"concat/chunked/{task_id}/{max_chunk_id}/{total_size}") + print(r.text) + assert "error" not in r.json() + + os.remove(pre_compressing_temp_path) + finished = False + while not finished: + r = session.get(self.rezstd_endpoint + f"status/{task_id}") + print(r.text, end="\r") + assert "error" not in r.json() + if r.json()["status"] == "finished": + 
finished = True + break + time.sleep(5) + print("Server side recompression finished, log:", + f"{self.rezstd_endpoint}log/{task_id}", + "(only available for a few days)") + r = session.get(self.rezstd_endpoint + f"download/{task_id}/wikiteam3_task.zst", stream=True) + content_length = int(r.headers["Content-Length"]) + with open(compressing_rezstded_temp_path, "wb") as f: + written = 0 + last_report_time = time.time() + for chunk in r.iter_content(chunk_size=1024 * 1024): + if time.time() - last_report_time > 10: + print(f"Downloaded {written/1024/1024:.2f}/{content_length/1024/1024:.2f} MB", end="\r") + last_report_time = time.time() + f.write(chunk) + written += len(chunk) + print() + # print("Download finished, deleting from server...") + # r = session.delete(self.rezstd_endpoint + f"delete/{task_id}") + # print(r.text) + # assert "error" not in r.json() + os.rename(compressing_rezstded_temp_path, compressing_temp_path) + + # move tmp file to final file + os.rename(compressing_temp_path, compressed_path) + return compressed_path + + def test_integrity(self, path: Union[str, Path]) -> bool: + ''' Test if path is a valid zstd compressed file. ''' + if isinstance(path, str): + path = Path(path) + path = path.resolve() + r = subprocess.run([self.bin_zstd,"-vv", "-d", "-t", "--long=31", str(path)]) + return r.returncode == 0 + +class SevenZipCompressor: + bin_7z = "7z" + def __init__(self, bin_7z: str = "7z"): + self.bin_7z = bin_7z + retcode = subprocess.call([self.bin_7z, "-h"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + if retcode: + raise FileNotFoundError(f"7z binary not found at {self.bin_7z}") + + def compress_dir(self, dir_path: Union[str, Path], level: int = 0): + ''' Compress dir_path into dump_dir.7z and return the resolved path to the compressed file. + + level: + - 0 -> only pack, no compression + - 1 -> fast + - ... 
+ - 9 -> ultra + ''' + if isinstance(dir_path, str): + dir_path = Path(dir_path) + dir_path = dir_path.resolve() # remove trailing slash + + archive_path = dir_path.parent / (dir_path.name + ".7z") # dir_path + ".7z" + archive_path = archive_path.resolve() + + archive_temp_path = dir_path.parent / (dir_path.name + ".7z.tmp") # dir_path + ".7z.tmp" + archive_temp_path = archive_temp_path.resolve() + + if archive_path.exists(): + print(f"File {archive_path} already exists. Skip compressing.") + return archive_path + + if level: + cmds = [self.bin_7z, "a", "-t7z", "-m0=lzma2", f"-mx={level}", "-scsUTF-8", + "-md=64m", "-ms=off"] + else: # level == 0 + assert level == 0 + cmds = [self.bin_7z, "a", "-t7z", f"-mx={level}", "-scsUTF-8", "-ms=off"] + cmds.extend([str(archive_temp_path), str(dir_path)]) + + r = subprocess.run(cmds, check=True) + + assert archive_temp_path.exists() + # move tmp file to final file + os.rename(archive_temp_path, archive_path) + assert archive_path == archive_path.resolve() + return archive_path + + def test_integrity(self, path: Union[str, Path]) -> bool: + ''' Test if path is a valid 7z archive. 
''' + if isinstance(path, str): + path = Path(path) + path = path.resolve() + r = subprocess.run([self.bin_7z, "t", str(path)]) + return r.returncode == 0 + +if __name__ == "__main__": + ZstdCompressor() + SevenZipCompressor() \ No newline at end of file diff --git a/wikiteam3/uploader/socketLock.py b/wikiteam3/uploader/socketLock.py new file mode 100644 index 00000000..881d56fa --- /dev/null +++ b/wikiteam3/uploader/socketLock.py @@ -0,0 +1,64 @@ + +import itertools +import socket +import time + + +class SocketLockServer: + """A server that binds to a port and holds it until released.""" + HOST, PORT = "localhost", 62954 + + def __init__(self): + self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + + def try_bind(self, strict=False): + try: + self._socket.bind((self.HOST, self.PORT)) + except OSError: + # Port is in use + if strict: + raise + return False + + print(f"SocketServer: Listening on {self.HOST}:{self.PORT}") + return True + + def is_port_in_use(self): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex((self.HOST, self.PORT)) == 0 + + + def bind_until_port_is_free(self): + dots = ["/", "-", "\\", "|"] + for dot in itertools.cycle(dots): + if not self.is_port_in_use(): + if self.try_bind(): + return True + time.sleep(0.5) + print(f"{self.HOST}:{self.PORT} is in use, waiting {dot}", end="\r") + + def release(self): + if not self._socket._closed: # type: ignore + self._socket.close() + print(f"SocketServer: Released {self.HOST}:{self.PORT}") + return True + print(f"SocketServer: No need to release {self.HOST}:{self.PORT}") + return None + + def __enter__(self): + self.bind_until_port_is_free() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return None + + +class NoLock: + def __init__(self): + pass + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return None diff --git a/wikiteam3/uploader/uploader.py b/wikiteam3/uploader/uploader.py new file mode 
100644 index 00000000..7b7aeccf --- /dev/null +++ b/wikiteam3/uploader/uploader.py @@ -0,0 +1,582 @@ +import argparse +from datetime import datetime +import json +import os +import random +import re +import shutil +from dataclasses import dataclass +import sys +import time +import traceback +from typing import Dict, List, Optional, Tuple, Union +import urllib.parse +from io import BytesIO +from pathlib import Path + +import requests +from internetarchive import get_item, Item +from file_read_backwards import FileReadBackwards + +from wikiteam3.dumpgenerator.api.page_titles import checkTitleOk +from wikiteam3.dumpgenerator.config import Config, load_config +from wikiteam3.dumpgenerator.version import getVersion +from wikiteam3.uploader.socketLock import NoLock, SocketLockServer +from wikiteam3.utils import url2prefix_from_config, sha1sum +from wikiteam3.uploader.compresser import ZstdCompressor, SevenZipCompressor +from wikiteam3.utils.ia_checker import ia_s3_tasks_load_avg +from wikiteam3.utils.util import ALL_DUMPED_MARK, UPLOADED_MARK, XMLRIVISIONS_INCREMENTAL_DUMP_MARK, is_empty_dir, mark_as_done, is_markfile_exists + +DEFAULT_COLLECTION = 'opensource' +IDENTIFIER_PREFIX = "wiki-" + +@dataclass +class IAKeys: + access: str + secret: str + +@dataclass +class Args: + keys_file: Path + collection: str + dry_run: bool + update: bool + wikidump_dir: Path + + bin_zstd: str + zstd_level: int + bin_7z: str + parallel: bool + + rezstd: bool + rezstd_endpoint: str + + def __post_init__(self): + self.keys_file = Path(self.keys_file).expanduser().resolve() + if not self.keys_file.exists(): + raise FileNotFoundError(f"Keys file {self.keys_file} does not exist") + self.wikidump_dir = Path(self.wikidump_dir).expanduser().resolve() + if not self.wikidump_dir.exists(): + raise FileNotFoundError(f"wikidump_dir {self.wikidump_dir} does not exist") + + +def read_ia_keys(path: Path) -> IAKeys: + with open(path.expanduser().resolve()) as f: + lines = f.readlines() + + access = 
lines[0].strip() + secret = lines[1].strip() + + return IAKeys(access, secret) + + +def config2basename(config: Config) -> str: + basename = "{}-{}".format( + url2prefix_from_config(config=config), + config.date, + ) + return basename + + +def xmldump_is_complete(xml_path: Union[str, Path]) -> bool: + lines_left = 100 + with FileReadBackwards(xml_path, encoding="utf-8") as frb: + for l in frb: + if l.strip() == "": + # xml dump is complete + return True + + lines_left -= 1 + if lines_left <= 0: + return False + + return False + + +def images_list_is_complete(images_txt_path: Union[str, Path]) -> bool: + lines_left = 3 + with FileReadBackwards(images_txt_path, encoding="utf-8") as frb: + for l in frb: + if l.strip() == "--END--": + # images list is complete + return True + + lines_left -= 1 + if lines_left <= 0: + return False + + return False + +def get_xml_filename(config: Config) -> str: + xml_filename = "{}-{}.xml".format( + config2basename(config), + "current" if config.curonly else "history", + ) + return xml_filename + + +def prepare_xml_zst_file(wikidump_dir: Path, config: Config, *, parallel: bool, + zstd_compressor: ZstdCompressor, zstd_level: int + ) -> Path: + """ Compress xml file to .zst file.""" + xml_filename = get_xml_filename(config) + + xml_file_path = wikidump_dir / xml_filename + xml_zstd_file_path = wikidump_dir / f"{xml_filename}.zst" + + assert xml_file_path.exists() or xml_zstd_file_path.exists() + + if xml_file_path.exists(): + assert xmldump_is_complete(xml_file_path) + with NoLock() if parallel else SocketLockServer(): + # ensure only one process is compressing, to avoid OOM + r = zstd_compressor.compress_file(xml_file_path, level=zstd_level) + assert r == xml_zstd_file_path.resolve() + assert xml_zstd_file_path.exists() + assert zstd_compressor.test_integrity(r) + + # rm source xml file + # decompressing is so fast that we don't need to keep the xml file + # os.remove(xml_file_path) + + assert xml_zstd_file_path.exists() + + return 
xml_zstd_file_path.resolve() + + +def prepare_images_7z_archive(wikidump_dir: Path, config: Config, parallel: bool, *, + images_source: str = "images", + sevenzip_compressor: SevenZipCompressor) -> Optional[Path]: + """ Compress wikidump_dir/images_source dir to .7z file. + + return: + Path: to the .7z archive + None: the dir is empty. + """ + images_dir = wikidump_dir / images_source + assert images_source in ["images", "images_mismatch"] + assert images_dir.exists() and images_dir.is_dir() + + if is_empty_dir(images_dir): + return None + + images_7z_archive_path = wikidump_dir / f"{config2basename(config)}-{images_source}.7z" + if not images_7z_archive_path.exists() or not images_7z_archive_path.is_file(): + with NoLock() if parallel else SocketLockServer(): + r = sevenzip_compressor.compress_dir(images_dir) + shutil.move(r, images_7z_archive_path) + + assert sevenzip_compressor.test_integrity(images_7z_archive_path) + + assert images_7z_archive_path.exists() and images_7z_archive_path.is_file() + return images_7z_archive_path.resolve() + + +def prepare_files_to_upload(wikidump_dir: Path, config: Config, item: Item, *, parallel: bool, + zstd_compressor: ZstdCompressor, zstd_level: int, + sevenzip_compressor: SevenZipCompressor + ) -> Dict[str, str]: + """ return: filedict ("remote filename": "local filename") """ + filedict = {} # "remote filename": "local filename" + + # config.json + config_json_path = wikidump_dir / "config.json" + assert config_json_path.exists() + filedict[f"{config2basename(config)}-dumpMeta/config.json"] = str(config_json_path) + + # errors.log optional + if (wikidump_dir / "errors.log").exists(): + filedict[f"{config2basename(config)}-dumpMeta/errors.log"] = str(wikidump_dir / "errors.log") + # SpecialVersion.html optional + if (wikidump_dir / "SpecialVersion.html").exists(): + filedict[f"{config2basename(config)}-dumpMeta/SpecialVersion.html"] = str(wikidump_dir / "SpecialVersion.html") + # siteinfo.json optional + if (wikidump_dir / 
"siteinfo.json").exists(): + filedict[f"{config2basename(config)}-dumpMeta/siteinfo.json"] = str(wikidump_dir / "siteinfo.json") + # index.html optional + if (wikidump_dir / "index.html").exists(): + filedict[f"{config2basename(config)}-dumpMeta/index.html"] = str(wikidump_dir / "index.html") + + print("=== commpressing necessary files: ===") + + # .xml dump + if config.xml: + if not config.xmlrevisions: + # -titles.txt + titles_txt_path = wikidump_dir / f"{config2basename(config)}-titles.txt" + titles_txt_zstd_path = wikidump_dir / f"{config2basename(config)}-titles.txt.zst" + assert titles_txt_path.exists() + assert checkTitleOk(config) + r = zstd_compressor.compress_file(titles_txt_path,level=zstd_level) + assert r == titles_txt_zstd_path.resolve() + assert zstd_compressor.test_integrity(r) + filedict[f"{config2basename(config)}-dumpMeta/{titles_txt_zstd_path.name}"] = str(titles_txt_zstd_path) + xml_zstd_path = prepare_xml_zst_file(wikidump_dir, config, parallel=parallel, zstd_compressor=zstd_compressor, zstd_level=zstd_level) + filedict[f"{xml_zstd_path.name}"] = str(xml_zstd_path) + + # redirects + if config.redirects: + # osm.bio-20241204-redirects.jsonl + redirects_jsonl_path = wikidump_dir / f"{config2basename(config)}-redirects.jsonl" + redirects_jsonl_zstd_path = wikidump_dir / f"{config2basename(config)}-redirects.jsonl.zst" + assert redirects_jsonl_path.exists() + # TODO: check if redirects dump is complete + r = zstd_compressor.compress_file(redirects_jsonl_path, level=zstd_level) + assert r == redirects_jsonl_zstd_path.resolve() + assert zstd_compressor.test_integrity(r) + filedict[f"{config2basename(config)}-dumpMeta/{redirects_jsonl_zstd_path.name}"] = str(redirects_jsonl_zstd_path) + + # images + if config.images: + # images.txt + images_txt_path = wikidump_dir / f"{config2basename(config)}-images.txt" + images_txt_zstd_path = wikidump_dir / f"{config2basename(config)}-images.txt.zst" + assert images_list_is_complete(images_txt_path) + r = 
zstd_compressor.compress_file(images_txt_path, level=zstd_level) + assert r == images_txt_zstd_path.resolve() + assert zstd_compressor.test_integrity(r) + filedict[f"{config2basename(config)}-dumpMeta/{images_txt_zstd_path.name}"] = str(images_txt_zstd_path) + + # images.7z and images_mismatch.7z + for images_source in ["images", "images_mismatch"]: + # <--- TODO: remove this block in v4.2.1 + if images_source == "images_mismatch" and not (wikidump_dir / images_source).exists(): + print(f"{images_source} dir not found, skip") + continue + # ---> + images_7z_archive_path = prepare_images_7z_archive(wikidump_dir, config, parallel, images_source=images_source, sevenzip_compressor=sevenzip_compressor) + if images_7z_archive_path: + filedict[f"{images_7z_archive_path.name}"] = str(images_7z_archive_path) + else: + print(f"{images_source} dir is empty, skip creating .7z archive") + + print("=== Files already uploaded: ===") + c = 0 + for file_in_item in item.files: + if file_in_item["name"] in filedict: + c += 1 + if int(file_in_item["size"]) != os.path.getsize(filedict[file_in_item["name"]]): + print(f' "{file_in_item["name"]}" (size mismatch), will re-upload') + continue + + filedict.pop(file_in_item["name"]) + print(f' "{file_in_item["name"]}" (already uploaded)') + print(f"Already uploaded {c} files. ({len(item.files)} files in remote item in total)") + + print("=== Files to upload: ===") + for remote_dest, local_src in filedict.items(): + print(f' "{remote_dest}" from "{local_src}"') + print(f"{len(filedict)} files ready to upload...") + + return filedict + +def prepare_item_metadata(wikidump_dir: Path, config: Config, arg: Args) -> Tuple[Dict, Optional[str]]: + """ return: (IA item metadata dict, logo_url) """ + + wiki_prefix: str = url2prefix_from_config(config=config, ascii_slugify=False) # e.g. 
wiki.example.org + + sitename: Optional[str] = None # or empty str + rights_text: Optional[str] = None # or empty str + rights_url: Optional[str] = None # or empty str + lang: Optional[str] = None # or empty str + base_url: Optional[str] = None # or empty str + logo_url: Optional[str] = None # or empty str + if (wikidump_dir / "siteinfo.json").exists(): + with open(wikidump_dir / "siteinfo.json", "r", encoding="utf-8") as f: + siteinfo: Dict = json.load(f) + + general = siteinfo.get("query", {}).get("general", {}) + rightsinfo = siteinfo.get("query", {}).get("rightsinfo", {}) + + + sitename = general.get("sitename", None) + assert isinstance(sitename, str) or sitename is None + + base_url = general.get("base", None) + assert isinstance(base_url, str) or base_url is None + if base_url: + if base_url.startswith("//"): + print(f"WARNING: base_url {base_url} starts with // (protocol-relative URLs), will convert to https://") + # Convert protocol-relative URLs + base_url = re.sub(r"^//", r"https://", base_url) + + logo_url = general.get("logo", None) + + lang = general.get("lang", None) + assert isinstance(lang, str) or lang is None + + rights_text = rightsinfo.get("text", None) + assert isinstance(rights_text, str) or rights_text is None + + rights_url = rightsinfo.get("url", None) + if rights_url and "www.fandom.com" in rights_url and "/licensing" in rights_url: + # Link the default license directly instead + rights_url = "https://creativecommons.org/licenses/by-sa/3.0/" + + assert isinstance(rights_url, str) or rights_url is None + + if config.xml: + xml_file_path = wikidump_dir / get_xml_filename(config) + assert xml_file_path.exists() + with open(xml_file_path, "rb") as f: + xmlheader = f.read(1024 * 1024) # 1MiB + # get sitename from xmlheader + if not sitename and b"" in xmlheader: + sitename = xmlheader.split(b"", 1)[1].split(b"", 1)[0].decode("utf-8") + if not base_url and b"" in xmlheader: + base_url = xmlheader.split(b"", 1)[1].split(b"", 
1)[0].decode("utf-8") + + if not base_url: + base_url = re.sub(r"(?im)/api\.php", r"", config.api or config.index) + + keywords = [ + "wiki", + "wikiteam", + "wikiteam3", + "MediaWiki", + wiki_prefix, + ] + if sitename: + keywords.append(sitename) + if not rights_url and not rights_text: + keywords.append("unknowncopyright") + + licenseurl: Optional[str] = urllib.parse.urljoin(config.api or config.index, rights_url) if rights_url else None + description = f'{sitename or wiki_prefix} dumped with wikiteam3 tools.' + + metadata = { + "mediatype": "web", + "collection": arg.collection, + "title": "Wiki - " + (sitename or wiki_prefix), + "description": description, # without URL, to bypass IA's anti-spam. + "language": lang, + "last-updated-date": time.strftime("%Y-%m-%d", time.gmtime()), + "subject": "; ".join( + keywords + ), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ... + "licenseurl": licenseurl or None, + "rights": rights_text or None, + "originalurl": config.api or config.index, + "upload-state": "uploading", + "scanner": f"wikiteam3 (v{getVersion()})", + } + print("=== Item metadata: ===") + print(json.dumps(metadata, indent=4, sort_keys=True, ensure_ascii=False)) + print(f"logo_url: {logo_url}") + + return metadata, logo_url + +def upload(arg: Args): + zstd_compressor = ZstdCompressor(bin_zstd=arg.bin_zstd, rezstd=arg.rezstd, rezstd_endpoint=arg.rezstd_endpoint) + sevenzip_compressor = SevenZipCompressor(bin_7z=arg.bin_7z) + ia_keys = read_ia_keys(arg.keys_file) + wikidump_dir = arg.wikidump_dir + wikidump_dir.name # {prefix}-{wikidump_dumpdate}-wikidump (e.g. 
wiki.example.org-20230730-wikidump) + assert wikidump_dir.name.endswith("-wikidump"), f"Expected wikidump_dir to end with -wikidump, got {wikidump_dir.name}" + + print(f"=== Loading config from {wikidump_dir} ===") + + # load config + init_config = Config() + init_config.path = str(wikidump_dir) + config = load_config(config_filename="config.json", config=init_config) + + config.path = str(wikidump_dir) # override path + + print(config) + + assert wikidump_dir == Path(config.path).resolve() + + assert is_markfile_exists(config, ALL_DUMPED_MARK), "Imcomplete dump" + assert not is_markfile_exists(config, XMLRIVISIONS_INCREMENTAL_DUMP_MARK), "xmlrevisions incremental dump is not supported yet" + if is_markfile_exists(config, UPLOADED_MARK): + print(f"Already uploaded to IA ({UPLOADED_MARK} exists), bye!") + return + + wikidump_dumpdate = wikidump_dir.name.split("-")[-2] + assert config.date == wikidump_dumpdate + if (not wikidump_dumpdate.isdigit()) or (not 20230730 < int(wikidump_dumpdate) < 9999_99_99): + raise ValueError(f"Expected wikidump_dumpdate to be an 8-digit number, got {wikidump_dumpdate}") + try: + datetime.strptime(wikidump_dumpdate, "%Y%m%d") + except ValueError as e: + raise ValueError(f"Expected wikidump_dumpdate to be a valid date, got {wikidump_dumpdate}") from e + + # NOTE: Punycoded domain may contain multiple `-` + # e.g. 
`xn--6qq79v.xn--rhqv96g-20230730-wikidump` (你好.世界_美丽-20230730-wikidump) + _identifier = IDENTIFIER_PREFIX + url2prefix_from_config(config=config) + "-" + config.date + identifier = IDENTIFIER_PREFIX + wikidump_dir.name.rstrip("-wikidump") + assert identifier == _identifier + + item = get_item(identifier) + + print("=== Preparing files to upload ===") + filedict = prepare_files_to_upload( + wikidump_dir, config, item, parallel=arg.parallel, + zstd_compressor=zstd_compressor, zstd_level=arg.zstd_level, + sevenzip_compressor=sevenzip_compressor + ) + + print("=== Preparing metadata ===") + metadata, logo_url = prepare_item_metadata(wikidump_dir, config, arg) + + print("=== Checking IA S3 load average (optional) ===") + + try: + avg_load = ia_s3_tasks_load_avg(session=item.session) # check IA load + print(f"IA S3 load: {avg_load * 100:.4f}%") + if avg_load > 0.99: + print("WARNING: IA S3 is heavily overloaded,") + print("To prevent IA S3 from being overloaded further, please try uploading later, exiting...") + sys.exit(99) + elif avg_load > 0.9: + print("WARNING: IA S3 is overloaded, upload may fail") + except Exception as e: + traceback.print_exc() + print(f"Failed to get IA S3 load average: {e}") + print("Don't worry, it's optional.") + + + if arg.dry_run: + print("=== Dry run, exiting ===") + return + + print("=== Uploading ===") + upload_main_resouces(item, filedict, metadata, ia_keys) + + item = get_item(identifier) + if logo_url: + print("=== Uploading logo (optional) ===") + try: + logo_url = urllib.parse.urljoin(config.api or config.index, logo_url) + upload_logo(item, logo_url, ia_keys) + except Exception as e: + traceback.print_exc() + print(f"Failed to upload logo: {e}") + print("Don't worry, it's optional.") + + item = get_item(identifier) + print("=== Updating upload-state ===") + if item.metadata.get("upload-state") != "uploaded": + r = item.modify_metadata({"upload-state": "uploaded"}, access_key=ia_keys.access, secret_key=ia_keys.secret) + assert 
def upload_main_resouces(item: Item, filedict: Dict[str, str], metadata: Dict, ia_keys: IAKeys):
    """ Upload the dump files to the IA item, then wait until the item exists.

    item: target item; its `upload()` is called with `filedict` and `metadata`
    filedict: "remote filename": "local filename"; an empty dict means
        there is nothing to do and the function returns immediately
    metadata: item metadata passed through to `item.upload()`
    ia_keys: provides the `access` / `secret` S3 keys

    raises:
        TimeoutError: the item still does not exist after all polls
        (any error raised by `raise_for_status()` on an upload response)
    """
    if not filedict:
        print("No files to upload, skip")
        return

    responses = item.upload(
        files=filedict,
        metadata=metadata,
        access_key=ia_keys.access,
        secret_key=ia_keys.secret,
        verbose=True,
        queue_derive=False,  # disable derive
    )
    for response in responses:
        assert isinstance(response, requests.Response)
        print(response.text)
        response.raise_for_status()
    print(f"Uploading {len(filedict)} files: Done.\n")

    identifier = item.identifier
    assert identifier

    max_tries = 400
    poll_interval = 30  # seconds between two existence checks
    overloaded_below = 395  # below this many tries left, use the "IA overloaded" wording

    item = get_item(identifier)  # refresh item
    for tries_left in range(max_tries, 0, -1):
        if item.exists:
            break

        # The original compared the constant total (`tries`) instead of the
        # loop counter, so the "IA overloaded" message could never be shown.
        if tries_left < overloaded_below:
            print(f"IA overloaded, still waiting for item to be created ({tries_left} tries left) ...", end='\r')
        else:
            print(f"Waiting for item to be created ({tries_left} tries left) ...", end='\r')
        time.sleep(poll_interval)
        item = get_item(identifier)

    if not item.exists:
        raise TimeoutError(f"IA overloaded, item still not created after {max_tries * poll_interval} seconds")
def search_ia(apiurl: Optional[str] = None, indexurl: Optional[str] = None, addeddate_intervals: Optional[List[str]] = None):
    """ Search the Internet Archive for items whose `originalurl` is this wiki.

    A missing `apiurl` / `indexurl` is derived from the other one by swapping
    the last `index.php` / `api.php` occurrence. The query covers both the
    http and https form of each URL, newest items first.

    addeddate_intervals: optional `[start, end]` pair used as an
        `addeddate:[start TO end]` filter.

    This is a generator: it yields every search result whose lower-cased
    `originalurl` equals the lower-cased `apiurl` or `indexurl`, and logs a
    warning for the others. `ValueError` is raised (on first iteration) when
    neither URL is usable.
    """
    if apiurl is None and indexurl:
        apiurl = 'api.php'.join(indexurl.rsplit('index.php', 1))
    if indexurl is None and apiurl:
        indexurl = 'index.php'.join(apiurl.rsplit('api.php', 1))

    if not apiurl and not indexurl:
        raise ValueError('apiurl or indexurl must be provided')

    ia_session = ArchiveSession()

    # every given URL, once per scheme
    urls_to_check: List[str] = []
    for candidate in (apiurl, indexurl):
        if not candidate:
            continue
        parsed = urlparse(candidate)
        for scheme in ('http', 'https'):
            urls_to_check.append(parsed._replace(scheme=scheme).geturl())

    url_clauses = [f'originalurl:"{url}"' for url in urls_to_check]
    query = '(' + ' OR '.join(url_clauses) + ')'
    if addeddate_intervals:
        query += f' AND addeddate:[{addeddate_intervals[0]} TO {addeddate_intervals[1]}]'

    search = Search(
        ia_session,
        query=query,
        fields=['identifier', 'addeddate', 'title', 'subject', 'originalurl', 'uploader', 'item_size'],
        sorts=['addeddate desc'],  # newest first
        max_retries=IA_MAX_RETRY,  # default 5
    )

    accepted_urls = [
        apiurl.lower() if apiurl else None,
        indexurl.lower() if indexurl else None,
    ]
    matched_any = False
    for result in search:
        # result looks like:
        # {'identifier': 'wiki-wikiothingxyz-20230315',
        #  'addeddate': '2023-03-15T01:42:12Z',
        #  'subject': ['wiki', 'wikiteam', 'MediaWiki', .....]}
        if result['originalurl'].lower() not in accepted_urls:
            logger.warning(f'Original URL mismatch: {result}')
            continue

        logger.info(f'Original URL match: {result}')
        yield result
        matched_any = True

    if not matched_any:
        logger.warning('No suitable dump found at Internet Archive')
def standardize_url(url: str, strict: bool = True):
    """ 1. strip and unquote url
        > raises ValueError if url contains newline (`\\n` or `\\r`) after stripping
        2. Add `http://` if scheme is missing
        > if `strict` is True, raises ValueError if scheme is missing
        3. Convert domain to IDNA
        > raises ValueError if the url has no hostname
        4. Remove port `:80` and `:443` if `http://` and `https://` respectively

        Steps 3 and 4 only rewrite the network-location part of the url
        (`[userinfo@]host[:port]`); everything else is returned untouched.
    """
    # TODO: make step 1,2,4 optional and reversible

    url = url.strip()
    if '\n' in url or '\r' in url:
        raise ValueError('URL contains newline')
    url = unquote(url, encoding='utf-8', errors='strict')

    if not url.startswith(('http://', 'https://')):
        if strict:
            raise ValueError(f'HTTP(s) scheme is missing: {url}')
        print('Warning: URL scheme is missing, assuming http://')
        url = 'http://' + url

    parsed = urlparse(url)
    hostname = parsed.hostname
    if not hostname:
        # previously crashed with AttributeError on `None.encode`
        raise ValueError(f'Hostname is missing: {url}')
    idna_hostname = hostname.encode('idna').decode('utf-8')
    port = parsed.port  # raises ValueError on a malformed port, as before

    # Split the netloc the same way urllib does, so that only the real host
    # and the real port are rewritten (a plain `url.replace(':80', '')` also
    # matched text inside the userinfo or an IPv6 literal).
    netloc = parsed.netloc
    userinfo, at_sign, hostport = netloc.rpartition('@')
    if hostport.startswith('['):
        # bracketed IPv6 literal: "[host]" optionally followed by ":port"
        host_text, closing, port_text = hostport.partition(']')
        host_text += closing
    else:
        host_text, colon, port_digits = hostport.partition(':')
        port_text = colon + port_digits

    if hostname != idna_hostname:
        print('Converting domain to IDNA: ' + hostname + ' -> ' + idna_hostname)
        host_text = idna_hostname

    if port == 80 and parsed.scheme == 'http':
        print('Removing port 80 from URL')
        port_text = ''

    if port == 443 and parsed.scheme == 'https':
        print('Removing port 443 from URL')
        port_text = ''

    new_netloc = userinfo + at_sign + host_text + port_text
    if new_netloc != netloc:
        netloc_start = len(parsed.scheme) + len('://')
        if url.startswith(netloc, netloc_start):
            url = url[:netloc_start] + new_netloc + url[netloc_start + len(netloc):]
        else:
            # urllib dropped characters (e.g. a tab) while parsing, so the
            # netloc is not at the expected offset: best-effort replacement.
            url = url.replace(netloc, new_netloc, 1)

    return url
def uniLogin(api: Optional[str] = '', index: Optional[str] = '', session: Optional[requests.Session] = None, username: str = '', password: str = ''):
    """ Try to login to a wiki using various methods.

    Return `session` if success, else return `None`.

    Try: `client login (api) => bot login (api) => index login (index)`

    session: session to log in with; when omitted, a new `requests.Session`
        is created for this call. (The default used to be a single Session
        built at import time and shared by every call.)

    raises ValueError if both `api` and `index` are empty, or if `username`
    or `password` is empty.
    """

    if (not api and not index) or (not username or not password):
        raise ValueError('uniLogin: api or index or username or password is empty')

    if session is None:
        session = requests.Session()

    if api:
        print("Trying to log in to the wiki using clientLogin... (MW 1.27+)")
        _session = client_login(api=api, session=session, username=username, password=password)
        if _session:
            return _session
        time.sleep(5)

        print("Trying to log in to the wiki using botLogin... (MW 1.27+)")
        _session = bot_login(api=api, session=session, username=username, password=password)
        if _session:
            return _session
        time.sleep(5)

    if index:
        print("Trying to log in to the wiki using indexLogin... (generic)")
        _session = index_login(index=index, session=session, username=username, password=password)
        if _session:
            return _session

    return None
def bot_login(api: str, session: requests.Session, username: str, password: str) -> Optional[requests.Session]:
    """ login to a wiki using BOT's name and password. (MediaWiki 1.27+)

    Posts `action=login` to `api` with a token from `fetch_login_token()`.

    return:
        session: the reply's `login.result` is `Success`
        None: no login token, an unexpected reply shape, or any other result
    """

    login_token = fetch_login_token(session=session, api=api)
    if not login_token:
        return None

    response = session.post(url=api, data={
        'action': "login",
        'lgname': username,
        'lgpassword': password,
        'lgtoken': login_token,
        'format': "json"
    })

    data = response.json()

    try:
        login_info = data['login']
        succeeded = login_info['result'] == 'Success'
        lgusername = login_info['lgusername'] if succeeded else None
    except (KeyError, TypeError):
        # `data` is passed as a separate print argument: concatenating it to
        # the message raised TypeError because it is not a str.
        print('bot login: Oops! Something went wrong -- ', data)
        return None

    if not succeeded:
        # A non-Success result used to fall through and return the session,
        # which made the caller treat a failed login as a successful one.
        print('bot login: Oops! Something went wrong -- ', data)
        return None

    print('bot login: Success! Welcome, ' + lgusername + '!')
    return session
+ (tested on MW 1.16...1.39) """ + wpEditToken = None + wpLoginToken = None + + params = { + 'title': 'Special:UserLogin', + } + r = session.get(index, allow_redirects=True, params=params) + + # Sample r.text: + # MW 1.16: + # MW 1.39: + html = lxml.html.fromstring(r.text) + if 'wpLoginToken' in r.text: + wpLoginToken = html.xpath('//input[@name="wpLoginToken"]/@value')[0] + + # Sample r.text: + # MW 1.16: None + # MW 1.39: + if 'wpEditToken' in r.text: + wpEditToken = html.xpath('//input[@name="wpEditToken"]/@value')[0] + print('index login: wpEditToken found.') + + data = { + 'wpName': username, # required + 'wpPassword': password, # required + 'wpLoginattempt': 'Log in', # required + 'wpLoginToken': wpLoginToken, # required + 'wpRemember': '1', # 0: not remember, 1: remember + 'wpEditToken': wpEditToken, # introduced before MW 1.27, not sure whether it's required. + 'authAction': 'login', # introduced before MW 1.39. + 'title': 'Special:UserLogin', # introduced before MW 1.39. + 'force': '', # introduced before MW 1.39, empty string is OK. + } + r = session.post(index, allow_redirects=False, params=params, data=data) + if r.status_code == 302: + print('index login: Success! Welcome, ', username, '!') + return session + else: + print('index login: Oops! 
def mod_requests_text(requests):
    """
    - Monkey patch `requests.Response.text` to handle incorrect encoding.
    - Replace error characters with U+FFFD if them are not too many. ($WIKITEAM3_REQUESTS_TEXT_FFFD_TOLERANCE)

    requests: the `requests` module (any object exposing a `Response` class);
        its `Response.text` is replaced by a property.

    The new `text` property:
    - uses `apparent_encoding` when `encoding` is None or 'ISO-8859-1', and
      'utf-8' when that is None too;
    - removes one leading UTF-8 BOM and then decodes as utf-8;
    - on UnicodeDecodeError, returns the text decoded with errors='replace'
      (emitting a UserWarning) when the share of newly introduced U+FFFD
      characters is within the tolerance, otherwise re-raises.
    """
    def new_text(_self: requests.Response):
        utf8_bom = b'\xef\xbb\xbf'

        # Handle incorrect encoding
        encoding = _self.encoding
        if encoding is None or encoding == 'ISO-8859-1':
            encoding = _self.apparent_encoding
        if encoding is None:
            encoding = 'utf-8'

        content = _self.content
        if content.startswith(utf8_bom):
            # Drop exactly the BOM prefix. `lstrip(utf8_bom)` strips any run
            # of the bytes EF/BB/BF, which also ate the first byte of a
            # following character such as U+FF01 (EF BC 81).
            content = content[len(utf8_bom):]
            encoding = "utf-8"

        try:
            return content.decode(encoding, errors="strict")
        except UnicodeDecodeError as e:
            FFFD_CHAR = '\ufffd'
            FFFD_TOLERANCE = float(os.environ.get('WIKITEAM3_REQUESTS_TEXT_FFFD_TOLERANCE', '0.01'))
            assert 0 <= FFFD_TOLERANCE <= 1
            print('UnicodeDecodeError:', e)
            # U+FFFD already present in the payload survives errors='ignore',
            # so the difference counts only the characters that failed to decode.
            ignore_text = content.decode(encoding, errors='ignore')
            FFFDs_in_ignore_text = ignore_text.count(FFFD_CHAR)
            replace_text = content.decode(encoding, errors='replace')
            FFFDs_in_replace_text = replace_text.count(FFFD_CHAR)

            bad_FFFDs = FFFDs_in_replace_text - FFFDs_in_ignore_text
            bad_FFFDs_ratio = bad_FFFDs / len(replace_text)

            if bad_FFFDs_ratio > FFFD_TOLERANCE:
                print(f"ERROR: Bad \\ufffd too many. {bad_FFFDs} bad FFFDs in {len(replace_text)} chars ({bad_FFFDs_ratio}) "
                      "Check the encoding or set $WIKITEAM3_REQUESTS_TEXT_FFFD_TOLERANCE to a higher value.")
                raise

            warnings.warn(
                message=f"found bad \\ufffd, but tolerable. {bad_FFFDs} bad FFFDs in {len(replace_text)} chars ({bad_FFFDs_ratio})",
                category=UserWarning
            )
            return replace_text

    requests.Response.text = property(new_text)  # type: ignore
+ + def clear_timeouted_pools(self): + for adapter in self.session.adapters.values(): + adapter: requests.adapters.HTTPAdapter + if adapter.poolmanager.pools._container.__len__() > 0 and \ + time.time() - self.last_clear_time > self.vaild_lft_sec: + # TODO: logging this + # print('Keep-alived timeout: %d' % adapter.poolmanager.pools._container.__len__(), "connection(s) dropped.") + adapter.poolmanager.clear() # clear all + self.last_clear_time = time.time() + + def hijack(self): + ''' Don't forget to call `release()` ''' + + # Monkey patch `requests.Session.send` + self.old_send_method = self.session.send + + def new_send(request: requests.PreparedRequest, **kwargs): + hard_retries_left = self.hard_retries + 1 + if hard_retries_left <= 0: + raise ValueError('hard_retries must be positive') + + accept_encoding = '' + + while hard_retries_left > 0: + try: + if self.add_delay: + Delay(msg=self.delay_msg, config=self.config) + + if self.free_timeout_connections: + self.clear_timeouted_pools() + + if _accept_encoding := accept_encoding or self.accept_encoding or request.headers.get("Accept-Encoding", ""): + request.headers["Accept-Encoding"] = _accept_encoding + + return self.old_send_method(request, **kwargs) + except (KeyboardInterrupt, requests.exceptions.ContentDecodingError): # don't retry + raise + except Exception as e: + hard_retries_left -= 1 + if hard_retries_left <= 0: + raise + + print('Hard retry... 
(%d), due to: %s' % (hard_retries_left, e)) + + # workaround for https://wiki.erischan.org/index.php/Main_Page and other ChunkedEncodingError sites + if isinstance(e, requests.exceptions.ChunkedEncodingError): + accept_encoding = 'identity' + print('retry with Accept-Encoding:', accept_encoding) + + # if --bypass-cdn-image-compression is enabled, retry with different url + assert isinstance(request.url, str) + if '_wikiteam3_nocdn=' in request.url: + request.url = request.url.replace('_wikiteam3_nocdn=init_req', f'_wikiteam3_nocdn=retry_{hard_retries_left}') + request.url = request.url.replace( + f'_wikiteam3_nocdn=retry_{hard_retries_left + 1}', + f'_wikiteam3_nocdn=retry_{hard_retries_left}' + ) + print('--bypass-cdn-image-compression: change url to', request.url, 'on hard retry...') + + time.sleep(3) + + self.session.send = new_send # type: ignore + self.hijacked = True + + def release(self): + ''' Undo monkey patch ''' + if not self.hijacked: + warnings.warn('Warning: SessionMonkeyPatch.release() called before hijack()', RuntimeWarning) + return + self.session.send = self.old_send_method + del self + + def __del__(self): + if self.hijacked: + print('Undo monkey patch...') + self.release() diff --git a/wikiteam3/utils/user_agent.py b/wikiteam3/utils/user_agent.py new file mode 100644 index 00000000..0b8a0bcf --- /dev/null +++ b/wikiteam3/utils/user_agent.py @@ -0,0 +1,325 @@ +import random + +import requests + + +def get_UserAgents(): + """Return a cool user-agent to hide Python user-agent""" + useragents = [ + # firefox + # 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0', + # 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0', + # "Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0", + # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0', + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36", + "Mozilla/5.0 
(Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/80.0.3987.162 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; 
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.22 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.22 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.42 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.42 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.53 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.53 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.68 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.68 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.72 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.72 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.20 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.20 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.27 Safari/537.36", 
+ "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.47 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.47 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.60 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.60 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.63 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.63 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.41 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.41 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.50 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/88.0.4324.50 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.79 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.79 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.87 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.87 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.23 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.23 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.47 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.47 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.58 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.58 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.69 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.69 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.19 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.19 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.70 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.70 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36", + 
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.19 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.19 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.38 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.38 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.57 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.57 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.69 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.69 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/92.0.4515.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.59 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.59 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.70 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.70 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.80 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.80 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.93 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.93 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.101 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.101 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.18 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.18 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.25 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.25 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.42 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.42 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.58 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.58 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.20 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.20 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.31 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.31 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.41 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.41 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.50 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.50 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.17 Safari/537.36", + 
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.17 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.32 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.32 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.49 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.49 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.18 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.18 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.35 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.35 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/96.0.4664.45 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.20 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.20 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.36 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.36 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.45 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.45 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.56 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.56 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.48 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.48 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.54 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.54 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.66 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.66 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.74 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.74 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.17 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.17 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.35 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.35 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.45 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.45 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.20 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.20 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.30 Safari/537.36", + 
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.46 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.46 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.56 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.56 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.15 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.15 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.26 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.26 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.34 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.34 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, 
like Gecko) Chrome/102.0.5005.27 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.49 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.49 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.24 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.24 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.33 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.33 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.42 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.42 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.20 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.20 Safari/537.36", + "Mozilla/5.0 
(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.29 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.29 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.39 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.39 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.48 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.48 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.57 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.57 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.65 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.65 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.19 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.19 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.28 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.28 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/105.0.5195.37 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.37 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.52 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.5195.52 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.21 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.21 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.61 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.61 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5304.18 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5304.18 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5304.29 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; 
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5304.29 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5304.36 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5304.36 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5304.62 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5304.62 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5304.68 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5304.68 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.22 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.22 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.40 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.48 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.48 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.62 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.62 
Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.71 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.71 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.25 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.25 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.36 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.36 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.46 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.46 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.61 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.61 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.74 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5414.74 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.30 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", + ] + return useragents + +def get_random_UserAgent(): + return 
random.choice(get_UserAgents()) + +def setup_random_UserAgent(session: requests.Session): + session._orirequest = session.request + def newrequest(*args, **kwargs): + session.headers.update({"User-Agent": get_random_UserAgent()}) + return session._orirequest(*args, **kwargs) + session.request = newrequest \ No newline at end of file diff --git a/wikiteam3/utils/util.py b/wikiteam3/utils/util.py new file mode 100644 index 00000000..cb5a8066 --- /dev/null +++ b/wikiteam3/utils/util.py @@ -0,0 +1,146 @@ +import datetime +import hashlib +from pathlib import Path +import re +import sys +from typing import Optional, Union + +from wikiteam3.dumpgenerator.config import Config + +ALL_DUMPED_MARK = "all_dumped.mark" +UPLOADED_MARK = 'uploaded_to_IA.mark' +XMLRIVISIONS_INCREMENTAL_DUMP_MARK = 'xmlrevisions_incremental_dump.mark' + +ALL_NAMESPACE_FLAG = "all" +""" DO NOT CHNAGE THIS VALUE, this magic value is used to work with config.json \n\n"""\ +""" I want use "*" as magic flag like MediaWiki does,"""\ +""" but "all" exists in wikiteam codebase years ago :( """ + + +def underscore(text: str) -> str: + """ replace(" ", "_") """ + return text.replace(" ", "_") + +def space(text: str) -> str: + """ replace("_", " ") """ + return text.replace("_", " ") + + +def clean_HTML(raw: str = "") -> str: + """Extract only the real wiki content and remove rubbish + This function is ONLY used to retrieve page titles + and file names when no API is available + DO NOT use this function to extract page content""" + # different "tags" used by different MediaWiki versions to mark where + # starts and ends content + if re.search("", raw): + raw = raw.split("")[1].split("")[0] + elif re.search("", raw): + raw = raw.split("")[1].split("")[0] + elif re.search("", raw): + raw = raw.split("")[1].split( + "" + )[0] + elif re.search("", raw): + raw = raw.split("")[1].split("")[0] + elif re.search(r'
      ', raw): + raw = raw.split('
      ')[ + 1 + ].split("
      ")[0] + elif re.search("')[0] + else: + print(raw[:250]) + print("This wiki doesn't use marks to split content") + sys.exit(1) + return raw + + +def undo_HTML_entities(text: str = "") -> str: + """Undo some HTML codes""" + + # i guess only < > & " ' need conversion + # http://www.w3schools.com/html/html_entities.asp + text = re.sub("<", "<", text) + text = re.sub(">", ">", text) + text = re.sub("&", "&", text) + text = re.sub(""", '"', text) + text = re.sub("'", "'", text) + + return text + + +def remove_IP(raw: str = "") -> str: + """Remove IP from HTML comments """ + + raw = re.sub(r"\d+\.\d+\.\d+\.\d+", "0.0.0.0", raw) + # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html + # weird cases as :: are not included + raw = re.sub( + r"(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}", + "0:0:0:0:0:0:0:0", + raw, + ) + + return raw + + +def clean_XML(xml: str = "") -> str: + """Trim redundant info from the XML however it comes""" + # do not touch XML codification, leave AS IS + + if re.search(r"\n", xml): + xml = xml.split("\n")[1] + if re.search(r"", xml): + xml = xml.split("")[0] + return xml + + +def sha1bytes(data: bytes) -> str: + """ return `hashlib.sha1(data).hexdigest()` """ + return hashlib.sha1(data).hexdigest() + + +def sha1sum(path: Union[str, Path]) -> str: + """ Return the SHA1 hash of a file """ + if isinstance(path, str): + path = Path(path).expanduser().resolve() + + if not path.is_file(): + raise FileNotFoundError(f"File {path} does not exist or is not a file") + + sha1 = hashlib.sha1() + with open(path, "rb") as f: + while True: + data = f.read(65536) + if not data: + break + sha1.update(data) + return sha1.hexdigest() + +def mark_as_done(config: Config, mark: str, msg: Optional[str] = None): + done_path = f"{config.path}/{mark}" + if Path(done_path).exists(): + return + with open(done_path, "w") as f: + today = 
datetime.datetime.isoformat(datetime.datetime.now(datetime.timezone.utc)) + f.write(f"{today}: {msg or ''}\n") + + return True + +def is_markfile_exists(config: Config, mark: str) -> bool: + return (Path(config.path)/ mark).exists() + +def int_or_zero(size: Union[int, str]) -> int: + return int(size) if ( + size + and ( + (isinstance(size, str) and size.isdigit()) + or + (isinstance(size, int)) + ) + ) else 0 + +def is_empty_dir(path: Union[str, Path]) -> bool: + assert Path(path).is_dir() + return not any(Path(path).iterdir()) diff --git a/wikiteam3/utils/wiki_avoid.py b/wikiteam3/utils/wiki_avoid.py new file mode 100644 index 00000000..039d6f64 --- /dev/null +++ b/wikiteam3/utils/wiki_avoid.py @@ -0,0 +1,49 @@ +import re +import sys +import urllib.robotparser +from urllib.parse import urlparse + +import requests + +from wikiteam3.dumpgenerator.config import Config, OtherConfig + +def avoid_WikiMedia_projects(config: Config, other: OtherConfig): + """Skip Wikimedia projects and redirect to the dumps website""" + + # notice about wikipedia dumps + url = "" + if config.api: + url = url + config.api + if config.index: + url = url + config.index + if re.findall( + r"(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org", + url, + ): + print("PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!") + print("Download the dumps from http://dumps.wikimedia.org") + if not other.force: + print("Thanks!") + sys.exit(2) + +def avoid_robots_disallow(config: Config, other: OtherConfig): + """Check if the robots.txt allows the download""" + url = config.api or config.index + exit_ = False + bot = urllib.robotparser.RobotFileParser() + try: + # Don't use the session.get() method here, since we want to avoid the session's retry logic + r = requests.get( + urlparse(url).scheme + '://' + urlparse(url).netloc + '/robots.txt', + cookies=other.session.cookies, headers=other.session.headers, 
verify=other.session.verify, proxies=other.session.proxies + ) + if r.status_code == 200: + bot.parse(r.text.splitlines()) + if not bot.can_fetch('wikiteam3', '/') and 'wikiteam3' in r.text: + print('This wiki not allow wikiteam3 to archive.') + exit_ = True + except Exception as e: + print('Error: cannot get robots.txt', e) + + if exit_: + sys.exit(20)