From 6670c94781de3b8f73cbbbb9f7e58487d5ab7cbd Mon Sep 17 00:00:00 2001 From: Tim Reichelt Date: Thu, 10 Aug 2017 08:42:49 +0000 Subject: [PATCH] Initial commit --- .gitignore | 1 + LOCATIONS.md | 46 ++ TODO | 3 + deduped/README.md | 10 + deduped/compress_shard.sh | 20 + deduped/deduped_from_shard.sh | 30 ++ deduped/shard_fifo.sh | 23 + download/README.md | 3 + download/count_downloads.sh | 19 + download/download_wet.sh | 14 + download/setup.sh | 21 + raw/README.md | 25 + raw/collect_langs.py | 98 ++++ raw/collect_monolingual.sh | 295 +++++++++++ raw/create_raw.sh | 36 ++ raw/create_raw_en.sh | 31 ++ s3/FILES.md | 14 + s3/check_deduped_en.sh | 14 + s3/check_raw_uploads.sh | 14 + s3/check_raw_uploads_en.sh | 7 + s3/continue_multipart.sh | 8 + s3/copy_deduped.py | 24 + s3/copy_deduped_en.py | 24 + s3/count_uploads.sh | 23 + s3/create_md5_sums.sh | 7 + s3/index_files/index_lm.html | 18 + s3/index_files/index_raw.html | 926 ++++++++++++++++++++++++++++++++++ s3/index_files/raw_links | 914 +++++++++++++++++++++++++++++++++ s3/make_non_en_public.sh | 8 + s3/rename_raw.sh | 43 ++ s3/s3_copy.py | 24 + s3/upload_deduped.sh | 9 + s3/upload_raw.sh | 8 + 33 files changed, 2760 insertions(+) create mode 100644 .gitignore create mode 100644 LOCATIONS.md create mode 100644 TODO create mode 100644 deduped/README.md create mode 100755 deduped/compress_shard.sh create mode 100755 deduped/deduped_from_shard.sh create mode 100755 deduped/shard_fifo.sh create mode 100644 download/README.md create mode 100755 download/count_downloads.sh create mode 100755 download/download_wet.sh create mode 100755 download/setup.sh create mode 100644 raw/README.md create mode 100755 raw/collect_langs.py create mode 100755 raw/collect_monolingual.sh create mode 100755 raw/create_raw.sh create mode 100755 raw/create_raw_en.sh create mode 100644 s3/FILES.md create mode 100755 s3/check_deduped_en.sh create mode 100755 s3/check_raw_uploads.sh create mode 100644 s3/check_raw_uploads_en.sh create mode 100755 
s3/continue_multipart.sh create mode 100755 s3/copy_deduped.py create mode 100755 s3/copy_deduped_en.py create mode 100755 s3/count_uploads.sh create mode 100755 s3/create_md5_sums.sh create mode 100644 s3/index_files/index_lm.html create mode 100644 s3/index_files/index_raw.html create mode 100644 s3/index_files/raw_links create mode 100755 s3/make_non_en_public.sh create mode 100755 s3/rename_raw.sh create mode 100755 s3/s3_copy.py create mode 100755 s3/upload_deduped.sh create mode 100755 s3/upload_raw.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c92d00f --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +jhu/ diff --git a/LOCATIONS.md b/LOCATIONS.md new file mode 100644 index 0000000..39a8bf9 --- /dev/null +++ b/LOCATIONS.md @@ -0,0 +1,46 @@ +# Locations of data + +## /fs/zisa0/commoncrawl + +- 2015_27 raw non-english +- 2016_30 raw non-english +- 2017_17 experiments with extracting parallel text + +## /fs/freyja0/commoncrawl + +- 2015_06 raw non-english +- 2015_27 langsplit files +- 2015_30 langsplit files +- 2017_17 langsplit files + +## /fs/mimir0/commoncrawl + +- 2015_06 english raw +- 2015_11, 2015_14, 2015_18, 2015_22, 2015_27, 2015_27, 2015_32, 2015_35, 2015_40, 2015_48, 2016_50, 2017_17 all raw + +## /fs/nas/tim/cc + +- 2015_11, 2015_14 english raw +- deduped files for ar, cs, de, es, fr, it, pl, ru + +## /fs/nas/heithrun0/commoncrawl/langsplit + +- langsplit files for all crawls from 2013_20 up to 2015_48 and for 2016_50 +- some scripts and files from Christian which seem to be related to the parallel corpus extraction + +## /fs/vili0/buck/cc/langsplit2/raw + +- non-english raw files for all 2014 crawls + +## /fs/vili0/buck/cc/langsplit2 and /fs/vili0/buck/cc/langsplit + +- temporary data between the langsplit files and the raw files for 2014 and 2015 crawls, potential candidate for deletion + +## /fs/vili0/www/data.statmt.org/ngrams + +- home directory of the "data.statmt.org/ngrams" website, contains symbolic links to old raw 
data + +## /fs/gna0/buck/cc/db + +- contains RocksDB Index data for all crawls from 2012 to 2015_40 + 2016_50; used in the parallel corpus extraction pipeline + diff --git a/TODO b/TODO new file mode 100644 index 0000000..1b98c39 --- /dev/null +++ b/TODO @@ -0,0 +1,3 @@ +- Create deduped files for all minor languages that are not present yet +- Update english deduped files +- Copy the English language trie on AWS S3 diff --git a/deduped/README.md b/deduped/README.md new file mode 100644 index 0000000..45d7746 --- /dev/null +++ b/deduped/README.md @@ -0,0 +1,10 @@ +# Deduping .raw files + +## Dedupe + + +## Shard and dedupe + +If all of the raw data of one language is too big to fit into memory we have to shard the raw data into multiple files. This is usually done with English. +Before the sharding we do some minor processing of the raw data which removes lines with the document delimiter hash (df6fa1abb58549287111ba8d776733e9), +strips leading and trailing white space and removes lines with invalid UTF-8. diff --git a/deduped/compress_shard.sh b/deduped/compress_shard.sh new file mode 100755 index 0000000..9d045db --- /dev/null +++ b/deduped/compress_shard.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e +set -o pipefail + +ID="$1" +SHARD_DIR="$2" +OUT_DIR="$3" + +INPUT_FILE="${SHARD_DIR}/en.tmp${ID}" +OUTPUT_FILE="${OUT_DIR}/en.tmp${ID}.gz" +DONEFILE="${OUTPUT_FILE}.done" + +if [[ -f "${DONEFILE}" ]]; then + exit 0 +fi + +< "${INPUT_FILE}" gzip -c > "${OUTPUT_FILE}" + +touch "${DONEFILE}" diff --git a/deduped/deduped_from_shard.sh b/deduped/deduped_from_shard.sh new file mode 100755 index 0000000..ee82961 --- /dev/null +++ b/deduped/deduped_from_shard.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e +set -o pipefail + +# Non-zero padded id. 
+ID="$1" +SHARD_DIR="$2" +PREVIOUS_DEDUPED_DIR="$3" +OUT_DIR="$4" + +PREPROCESS_DIR="/fs/zisa0/tim/dev/preprocess/bin" + +PADDED_ID=$(printf "%02d" ${ID}) +INPUT_FILE="${SHARD_DIR}/en.tmp${ID}" +OUTPUT_FILE="${OUT_DIR}/en.${PADDED_ID}.deduped.xz" +DONEFILE="${OUTPUT_FILE}.done" + +PREVIOUS_DEDUPED_FILE="${PREVIOUS_DEDUPED_DIR}/en.${PADDED_ID}.deduped.xz" + +if [[ -f "${DONEFILE}" ]]; then + exit 0 +fi + + +<"${INPUT_FILE}" ${PREPROCESS_DIR}/commoncrawl_dedupe ${PREVIOUS_DEDUPED_FILE} | xz > "${OUTPUT_FILE}" + +rm "${INPUT_FILE}" + +touch "${DONEFILE}" diff --git a/deduped/shard_fifo.sh b/deduped/shard_fifo.sh new file mode 100755 index 0000000..4f7a8d8 --- /dev/null +++ b/deduped/shard_fifo.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -e +set -o pipefail + +RAW_DIR="$1" +TMP_DIR="$2" + +PREPROCESS_DIR="/fs/zisa0/tim/dev/preprocess/bin" +RAW_FILES="${RAW_DIR}/*.raw.xz" + +TMP_PREFIX="en.tmp" +SHARD_COUNT=100 + +# Create named pipes +for i in $(seq 0 $((SHARD_COUNT-1))); do + mkfifo "${TMP_DIR}/${TMP_PREFIX}${i}" +done + +# Clean raw files and shard them into pipes +/fs/zisa0/tim/bin/xz -T10 -cd ${RAW_FILES} | \ + ${PREPROCESS_DIR}/commoncrawl_clean | \ + ${PREPROCESS_DIR}/shard_fifo ${TMP_DIR}/${TMP_PREFIX} ${SHARD_COUNT} diff --git a/download/README.md b/download/README.md new file mode 100644 index 0000000..99b4b56 --- /dev/null +++ b/download/README.md @@ -0,0 +1,3 @@ +# Download CommonCrawl data + +Scripts for the monolingual pipeline as described [here](https://github.com/ModernMT/DataCollection/blob/master/metadata/metadata.md). `setup.sh` creates all necessary directories and downloads the list of target URLs. `download_wet.sh` does the actual download. `count_downloads.sh` counts how many of the files are already downloaded. 
diff --git a/download/count_downloads.sh b/download/count_downloads.sh new file mode 100755 index 0000000..ca222c2 --- /dev/null +++ b/download/count_downloads.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e +set -o pipefail + +total=0 +downloaded=0 +echo "$total"; echo -en "\e[1A" +for path in `cat $1`; do + echo -e "\e[0K\r $total"; echo -en "\e[1A" + total=$((total+1)) + FILENAME=$(echo $path | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF)}') + if [ -f ${FILENAME}.done ]; then + downloaded=$((downloaded+1)) + fi +done + +echo "$downloaded/$total" +echo "Downloaded/Total" diff --git a/download/download_wet.sh b/download/download_wet.sh new file mode 100755 index 0000000..73c1f43 --- /dev/null +++ b/download/download_wet.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e +set -o pipefail + +FILENAME=$(echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF)}') + +if [ ! -f ${FILENAME}.done ]; then + curl -s $1 | gzip -cd | \ + /fs/nas/heithrun0/commoncrawl/langsplit/bin/read_wet.py | \ + /fs/nas/heithrun0/commoncrawl/langsplit/bin/langsplit --printchunks 2> /dev/null | \ + xz -9 -e -T 2 > ${FILENAME}.langsplit.xz + touch ${FILENAME}.done +fi diff --git a/download/setup.sh b/download/setup.sh new file mode 100755 index 0000000..5541e3d --- /dev/null +++ b/download/setup.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -e +set -o pipefail + + +YEAR=$(echo $1 | awk ' BEGIN { FS = "_" } { print $1 }') +WEEK=$(echo $1 | awk ' BEGIN { FS = "_" } { print $2 }') + +# Make directory for specified crawl +mkdir -p ${1}/wet +cd ${1}/wet + +# Download path file +wget https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-${YEAR}-${WEEK}/wet.paths.gz + +# Convert to HTTPS URLs +gzip -cd wet.paths.gz | sed 's/^/https:\/\/commoncrawl.s3.amazonaws.com\//' > wet.paths.http + +# Make subdirectories +for f in `gzip -cd wet.paths.gz | cut -d '/' -f 4 | sort | uniq`; do mkdir -p $f; done; diff --git a/raw/README.md b/raw/README.md new file mode 100644 index 0000000..54e2f4e --- /dev/null +++ 
b/raw/README.md @@ -0,0 +1,25 @@ +# Creating .raw files + +## High-level description + +This pipeline takes the `*.langsplit.xz` files as input. Note that each crawl from CommonCrawl is usually split into 100 different shards. +However, this number is not necessarily consistent among all crawls (e.g. sometimes it might be 98). Each of those 100 different shards is in turn split into +several hundred files. For each of these files we have one `.langsplit.xz` file. + +The script `collect_monolingual.sh` takes as input the directory name of one shard and reads all the `.langsplit.xz` files in that directory and splits them +according to language. The second argument of this script is the output directory. For each language `collect_monolingual.sh` writes a file with the name +`text.${language}.gz` to the output directory. + +Now since `collect_monolingual.sh` is called on each of the 100 shards separately we still have to concatenate all the different `text.${language}.gz` files +into one big `${language}.raw.xz` file. This is done with the `create_raw.sh` script. There is a separate `create_raw_en.sh` since we want to create 100 raw files +for English because a single raw file for English would be too large. 
+ +## Running the pipeline + +```bash +ls * | parallel ./collect_monolingual.sh {} {} +``` + +```bash +cat language.codes | parallel $crawl_dir $out_dir {} +``` diff --git a/raw/collect_langs.py b/raw/collect_langs.py new file mode 100755 index 0000000..513dabc --- /dev/null +++ b/raw/collect_langs.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import sys +import argparse + +magic_number = 'df6fa1abb58549287111ba8d776733e9' + +cld2_langcodes = ['en', 'da', 'nl', 'fi', 'fr', 'de', 'iw', 'it', + 'ja', 'ko', 'no', 'pl', 'pt', 'ru', 'es', 'sv', + 'zh', 'cs', 'el', 'is', 'lv', 'lt', 'ro', 'hu', + 'et', 'xxx', 'un', 'bg', 'hr', 'sr', 'ga', 'gl', + 'tl', 'tr', 'uk', 'hi', 'mk', 'bn', 'id', 'la', + 'ms', 'ml', 'cy', 'ne', 'te', 'sq', 'ta', 'be', + 'jw', 'oc', 'ur', 'bh', 'gu', 'th', 'ar', 'ca', + 'eo', 'eu', 'ia', 'kn', 'pa', 'gd', 'sw', 'sl', + 'mr', 'mt', 'vi', 'fy', 'sk', 'zh-Hant', 'fo', + 'su', 'uz', 'am', 'az', 'ka', 'ti', 'fa', 'bs', + 'si', 'nn', 'xh', 'zu', 'gn', 'st', 'tk', 'ky', + 'br', 'tw', 'yi', 'so', 'ug', 'ku', 'mn', 'hy', + 'lo', 'sd', 'rm', 'af', 'lb', 'my', 'km', 'bo', + 'dv', 'chr', 'syr', 'lif', 'or', 'as', 'co', + 'ie', 'kk', 'ln', 'mi', 'wo', 'ab', 'aa', 'ay', + 'ba', 'bi', 'dz', 'fj', 'kl', 'ha', 'ht', 'ik', + 'iu', 'ks', 'rw', 'mg', 'na', 'om', 'rn', 'sm', + 'sg', 'sa', 'ss', 'ts', 'tn', 'vo', 'za', 'kha', + 'sco', 'lg', 'gv', 'sr-ME', 'ak', 'ig', 'mfe', + 'haw', 'ceb', 'ee', 'gaa', 'blu', 'kri', 'loz', + 'lua', 'luo', 'new', 'ny', 'os', 'pam', 'nso', + 'raj', 'crs', 'tum', 've', 'war', 'nr', 'zzb', + 'zzp', 'zzh', 'tlh', 'zze', 'xx-Zyyy', 'xx-Latn', + 'xx-Grek', 'xx-Cyrl', 'xx-Armn', 'xx-Hebr', + 'xx-Arab', 'xx-Syrc', 'xx-Thaa', 'xx-Deva', + 'xx-Beng', 'xx-Guru', 'xx-Gujr', 'xx-Orya', + 'xx-Taml', 'xx-Telu', 'xx-Knda', 'xx-Mlym', + 'xx-Sinh', 'xx-Thai', 'xx-Laoo', 'xx-Tibt', + 'xx-Mymr', 'xx-Geor', 'xx-Hang', 'xx-Ethi', + 'xx-Cher', 'xx-Cans', 'xx-Ogam', 'xx-Runr', + 'xx-Khmr', 'xx-Mong', 'xx-Hira', 'xx-Kana', + 'xx-Bopo', 
'xx-Hani', 'xx-Yiii', 'xx-Ital', + 'xx-Goth', 'xx-Dsrt', 'xx-Qaai', 'xx-Tglg', + 'xx-Hano', 'xx-Buhd', 'xx-Tagb', 'xx-Limb', + 'xx-Tale', 'xx-Linb', 'xx-Ugar', 'xx-Shaw', + 'xx-Osma', 'xx-Cprt', 'xx-Brai', 'xx-Bugi', + 'xx-Copt', 'xx-Talu', 'xx-Glag', 'xx-Tfng', + 'xx-Sylo', 'xx-Xpeo', 'xx-Khar', 'xx-Bali', + 'xx-Xsux', 'xx-Phnx', 'xx-Phag', 'xx-Nkoo', + 'xx-Sund', 'xx-Lepc', 'xx-Olck', 'xx-Vaii', + 'xx-Saur', 'xx-Kali', 'xx-Rjng', 'xx-Lyci', + 'xx-Cari', 'xx-Lydi', 'xx-Cham', 'xx-Lana', + 'xx-Tavt', 'xx-Avst', 'xx-Egyp', 'xx-Samr', + 'xx-Lisu', 'xx-Bamu', 'xx-Java', 'xx-Mtei', + 'xx-Armi', 'xx-Sarb', 'xx-Prti', 'xx-Phli', + 'xx-Orkh', 'xx-Kthi', 'xx-Batk', 'xx-Brah', + 'xx-Mand', 'xx-Cakm', 'xx-Merc', 'xx-Mero', + 'xx-Plrd', 'xx-Shrd', 'xx-Sora', 'xx-Takr'] +cld2_langcodes = [lc.replace('-', '_') for lc in cld2_langcodes] + +parser = argparse.ArgumentParser() +for lc in cld2_langcodes: + parser.add_argument("-%s" % lc, + help="outfile for %s data" % lc, + type=argparse.FileType('wb')) +args = parser.parse_args() + +lang2file = {} +for lc in cld2_langcodes: + if getattr(args, lc) is not None: + lang2file[lc] = getattr(args, lc) + + +buf = [] +current_lang = None + +for line in sys.stdin: + if line.startswith(magic_number): + if buf: + assert current_lang is not None + lang2file[current_lang].write("".join(buf)) + + current_lang = None + buf = [] + + for kv in line.strip().split(): + if kv.startswith("language:"): + lang = kv.split(':', 1)[1] + if lang in lang2file: + current_lang = lang + + if current_lang: + buf.append(line) + +if buf: + assert current_lang is not None + lang2file[current_lang].write("".join(buf)) + +for _, lang_file in lang2file.iteritems(): + lang_file.flush() + lang_file.close() diff --git a/raw/collect_monolingual.sh b/raw/collect_monolingual.sh new file mode 100755 index 0000000..b69d3aa --- /dev/null +++ b/raw/collect_monolingual.sh @@ -0,0 +1,295 @@ +#!/bin/bash + +# Exit as soon as any command fails +set -e +set -o pipefail + 
+BINDIR=/fs/freyja0/commoncrawl + +DATADIR=$1 +OUTDIR=$2 + +mkdir -p ${OUTDIR} + +DONEFILE=${OUTDIR}/langsplit.done + +if [ ! -f ${DONEFILE} ]; then + xzcat ${DATADIR}/*.langsplit.xz | ${BINDIR}/collect_langs.py \ + -en >(pigz -9 >${OUTDIR}/text.en.gz) \ + -da >(pigz -9 >${OUTDIR}/text.da.gz) \ + -nl >(pigz -9 >${OUTDIR}/text.nl.gz) \ + -fi >(pigz -9 >${OUTDIR}/text.fi.gz) \ + -fr >(pigz -9 >${OUTDIR}/text.fr.gz) \ + -de >(pigz -9 >${OUTDIR}/text.de.gz) \ + -iw >(pigz -9 >${OUTDIR}/text.iw.gz) \ + -it >(pigz -9 >${OUTDIR}/text.it.gz) \ + -ja >(pigz -9 >${OUTDIR}/text.ja.gz) \ + -ko >(pigz -9 >${OUTDIR}/text.ko.gz) \ + -no >(pigz -9 >${OUTDIR}/text.no.gz) \ + -pl >(pigz -9 >${OUTDIR}/text.pl.gz) \ + -pt >(pigz -9 >${OUTDIR}/text.pt.gz) \ + -ru >(pigz -9 >${OUTDIR}/text.ru.gz) \ + -es >(pigz -9 >${OUTDIR}/text.es.gz) \ + -sv >(pigz -9 >${OUTDIR}/text.sv.gz) \ + -zh >(pigz -9 >${OUTDIR}/text.zh.gz) \ + -cs >(pigz -9 >${OUTDIR}/text.cs.gz) \ + -el >(pigz -9 >${OUTDIR}/text.el.gz) \ + -is >(pigz -9 >${OUTDIR}/text.is.gz) \ + -lv >(pigz -9 >${OUTDIR}/text.lv.gz) \ + -lt >(pigz -9 >${OUTDIR}/text.lt.gz) \ + -ro >(pigz -9 >${OUTDIR}/text.ro.gz) \ + -hu >(pigz -9 >${OUTDIR}/text.hu.gz) \ + -et >(pigz -9 >${OUTDIR}/text.et.gz) \ + -xxx >(pigz -9 >${OUTDIR}/text.xxx.gz) \ + -un >(pigz -9 >${OUTDIR}/text.un.gz) \ + -bg >(pigz -9 >${OUTDIR}/text.bg.gz) \ + -hr >(pigz -9 >${OUTDIR}/text.hr.gz) \ + -sr >(pigz -9 >${OUTDIR}/text.sr.gz) \ + -ga >(pigz -9 >${OUTDIR}/text.ga.gz) \ + -gl >(pigz -9 >${OUTDIR}/text.gl.gz) \ + -tl >(pigz -9 >${OUTDIR}/text.tl.gz) \ + -tr >(pigz -9 >${OUTDIR}/text.tr.gz) \ + -uk >(pigz -9 >${OUTDIR}/text.uk.gz) \ + -hi >(pigz -9 >${OUTDIR}/text.hi.gz) \ + -mk >(pigz -9 >${OUTDIR}/text.mk.gz) \ + -bn >(pigz -9 >${OUTDIR}/text.bn.gz) \ + -id >(pigz -9 >${OUTDIR}/text.id.gz) \ + -la >(pigz -9 >${OUTDIR}/text.la.gz) \ + -ms >(pigz -9 >${OUTDIR}/text.ms.gz) \ + -ml >(pigz -9 >${OUTDIR}/text.ml.gz) \ + -cy >(pigz -9 >${OUTDIR}/text.cy.gz) \ + -ne >(pigz -9 
>${OUTDIR}/text.ne.gz) \ + -te >(pigz -9 >${OUTDIR}/text.te.gz) \ + -sq >(pigz -9 >${OUTDIR}/text.sq.gz) \ + -ta >(pigz -9 >${OUTDIR}/text.ta.gz) \ + -be >(pigz -9 >${OUTDIR}/text.be.gz) \ + -jw >(pigz -9 >${OUTDIR}/text.jw.gz) \ + -oc >(pigz -9 >${OUTDIR}/text.oc.gz) \ + -ur >(pigz -9 >${OUTDIR}/text.ur.gz) \ + -bh >(pigz -9 >${OUTDIR}/text.bh.gz) \ + -gu >(pigz -9 >${OUTDIR}/text.gu.gz) \ + -th >(pigz -9 >${OUTDIR}/text.th.gz) \ + -ar >(pigz -9 >${OUTDIR}/text.ar.gz) \ + -ca >(pigz -9 >${OUTDIR}/text.ca.gz) \ + -eo >(pigz -9 >${OUTDIR}/text.eo.gz) \ + -eu >(pigz -9 >${OUTDIR}/text.eu.gz) \ + -ia >(pigz -9 >${OUTDIR}/text.ia.gz) \ + -kn >(pigz -9 >${OUTDIR}/text.kn.gz) \ + -pa >(pigz -9 >${OUTDIR}/text.pa.gz) \ + -gd >(pigz -9 >${OUTDIR}/text.gd.gz) \ + -sw >(pigz -9 >${OUTDIR}/text.sw.gz) \ + -sl >(pigz -9 >${OUTDIR}/text.sl.gz) \ + -mr >(pigz -9 >${OUTDIR}/text.mr.gz) \ + -mt >(pigz -9 >${OUTDIR}/text.mt.gz) \ + -vi >(pigz -9 >${OUTDIR}/text.vi.gz) \ + -fy >(pigz -9 >${OUTDIR}/text.fy.gz) \ + -sk >(pigz -9 >${OUTDIR}/text.sk.gz) \ + -zh_Hant >(pigz -9 >${OUTDIR}/text.zh-Hant.gz) \ + -fo >(pigz -9 >${OUTDIR}/text.fo.gz) \ + -su >(pigz -9 >${OUTDIR}/text.su.gz) \ + -uz >(pigz -9 >${OUTDIR}/text.uz.gz) \ + -am >(pigz -9 >${OUTDIR}/text.am.gz) \ + -az >(pigz -9 >${OUTDIR}/text.az.gz) \ + -ka >(pigz -9 >${OUTDIR}/text.ka.gz) \ + -ti >(pigz -9 >${OUTDIR}/text.ti.gz) \ + -fa >(pigz -9 >${OUTDIR}/text.fa.gz) \ + -bs >(pigz -9 >${OUTDIR}/text.bs.gz) \ + -si >(pigz -9 >${OUTDIR}/text.si.gz) \ + -nn >(pigz -9 >${OUTDIR}/text.nn.gz) \ + -xh >(pigz -9 >${OUTDIR}/text.xh.gz) \ + -zu >(pigz -9 >${OUTDIR}/text.zu.gz) \ + -gn >(pigz -9 >${OUTDIR}/text.gn.gz) \ + -st >(pigz -9 >${OUTDIR}/text.st.gz) \ + -tk >(pigz -9 >${OUTDIR}/text.tk.gz) \ + -ky >(pigz -9 >${OUTDIR}/text.ky.gz) \ + -br >(pigz -9 >${OUTDIR}/text.br.gz) \ + -tw >(pigz -9 >${OUTDIR}/text.tw.gz) \ + -yi >(pigz -9 >${OUTDIR}/text.yi.gz) \ + -so >(pigz -9 >${OUTDIR}/text.so.gz) \ + -ug >(pigz -9 
>${OUTDIR}/text.ug.gz) \ + -ku >(pigz -9 >${OUTDIR}/text.ku.gz) \ + -mn >(pigz -9 >${OUTDIR}/text.mn.gz) \ + -hy >(pigz -9 >${OUTDIR}/text.hy.gz) \ + -lo >(pigz -9 >${OUTDIR}/text.lo.gz) \ + -sd >(pigz -9 >${OUTDIR}/text.sd.gz) \ + -rm >(pigz -9 >${OUTDIR}/text.rm.gz) \ + -af >(pigz -9 >${OUTDIR}/text.af.gz) \ + -lb >(pigz -9 >${OUTDIR}/text.lb.gz) \ + -my >(pigz -9 >${OUTDIR}/text.my.gz) \ + -km >(pigz -9 >${OUTDIR}/text.km.gz) \ + -bo >(pigz -9 >${OUTDIR}/text.bo.gz) \ + -dv >(pigz -9 >${OUTDIR}/text.dv.gz) \ + -chr >(pigz -9 >${OUTDIR}/text.chr.gz) \ + -syr >(pigz -9 >${OUTDIR}/text.syr.gz) \ + -lif >(pigz -9 >${OUTDIR}/text.lif.gz) \ + -or >(pigz -9 >${OUTDIR}/text.or.gz) \ + -as >(pigz -9 >${OUTDIR}/text.as.gz) \ + -co >(pigz -9 >${OUTDIR}/text.co.gz) \ + -ie >(pigz -9 >${OUTDIR}/text.ie.gz) \ + -kk >(pigz -9 >${OUTDIR}/text.kk.gz) \ + -ln >(pigz -9 >${OUTDIR}/text.ln.gz) \ + -mi >(pigz -9 >${OUTDIR}/text.mi.gz) \ + -wo >(pigz -9 >${OUTDIR}/text.wo.gz) \ + -ab >(pigz -9 >${OUTDIR}/text.ab.gz) \ + -aa >(pigz -9 >${OUTDIR}/text.aa.gz) \ + -ay >(pigz -9 >${OUTDIR}/text.ay.gz) \ + -ba >(pigz -9 >${OUTDIR}/text.ba.gz) \ + -bi >(pigz -9 >${OUTDIR}/text.bi.gz) \ + -dz >(pigz -9 >${OUTDIR}/text.dz.gz) \ + -fj >(pigz -9 >${OUTDIR}/text.fj.gz) \ + -kl >(pigz -9 >${OUTDIR}/text.kl.gz) \ + -ha >(pigz -9 >${OUTDIR}/text.ha.gz) \ + -ht >(pigz -9 >${OUTDIR}/text.ht.gz) \ + -ik >(pigz -9 >${OUTDIR}/text.ik.gz) \ + -iu >(pigz -9 >${OUTDIR}/text.iu.gz) \ + -ks >(pigz -9 >${OUTDIR}/text.ks.gz) \ + -rw >(pigz -9 >${OUTDIR}/text.rw.gz) \ + -mg >(pigz -9 >${OUTDIR}/text.mg.gz) \ + -na >(pigz -9 >${OUTDIR}/text.na.gz) \ + -om >(pigz -9 >${OUTDIR}/text.om.gz) \ + -rn >(pigz -9 >${OUTDIR}/text.rn.gz) \ + -sm >(pigz -9 >${OUTDIR}/text.sm.gz) \ + -sg >(pigz -9 >${OUTDIR}/text.sg.gz) \ + -sa >(pigz -9 >${OUTDIR}/text.sa.gz) \ + -ss >(pigz -9 >${OUTDIR}/text.ss.gz) \ + -ts >(pigz -9 >${OUTDIR}/text.ts.gz) \ + -tn >(pigz -9 >${OUTDIR}/text.tn.gz) \ + -vo >(pigz -9 >${OUTDIR}/text.vo.gz) \ 
+ -za >(pigz -9 >${OUTDIR}/text.za.gz) \ + -kha >(pigz -9 >${OUTDIR}/text.kha.gz) \ + -sco >(pigz -9 >${OUTDIR}/text.sco.gz) \ + -lg >(pigz -9 >${OUTDIR}/text.lg.gz) \ + -gv >(pigz -9 >${OUTDIR}/text.gv.gz) \ + -sr_ME >(pigz -9 >${OUTDIR}/text.sr-ME.gz) \ + -ak >(pigz -9 >${OUTDIR}/text.ak.gz) \ + -ig >(pigz -9 >${OUTDIR}/text.ig.gz) \ + -mfe >(pigz -9 >${OUTDIR}/text.mfe.gz) \ + -haw >(pigz -9 >${OUTDIR}/text.haw.gz) \ + -ceb >(pigz -9 >${OUTDIR}/text.ceb.gz) \ + -ee >(pigz -9 >${OUTDIR}/text.ee.gz) \ + -gaa >(pigz -9 >${OUTDIR}/text.gaa.gz) \ + -blu >(pigz -9 >${OUTDIR}/text.blu.gz) \ + -kri >(pigz -9 >${OUTDIR}/text.kri.gz) \ + -loz >(pigz -9 >${OUTDIR}/text.loz.gz) \ + -lua >(pigz -9 >${OUTDIR}/text.lua.gz) \ + -luo >(pigz -9 >${OUTDIR}/text.luo.gz) \ + -new >(pigz -9 >${OUTDIR}/text.new.gz) \ + -ny >(pigz -9 >${OUTDIR}/text.ny.gz) \ + -os >(pigz -9 >${OUTDIR}/text.os.gz) \ + -pam >(pigz -9 >${OUTDIR}/text.pam.gz) \ + -nso >(pigz -9 >${OUTDIR}/text.nso.gz) \ + -raj >(pigz -9 >${OUTDIR}/text.raj.gz) \ + -crs >(pigz -9 >${OUTDIR}/text.crs.gz) \ + -tum >(pigz -9 >${OUTDIR}/text.tum.gz) \ + -ve >(pigz -9 >${OUTDIR}/text.ve.gz) \ + -war >(pigz -9 >${OUTDIR}/text.war.gz) \ + -nr >(pigz -9 >${OUTDIR}/text.nr.gz) \ + -zzb >(pigz -9 >${OUTDIR}/text.zzb.gz) \ + -zzp >(pigz -9 >${OUTDIR}/text.zzp.gz) \ + -zzh >(pigz -9 >${OUTDIR}/text.zzh.gz) \ + -tlh >(pigz -9 >${OUTDIR}/text.tlh.gz) \ + -zze >(pigz -9 >${OUTDIR}/text.zze.gz) \ + -xx_Zyyy >(pigz -9 >${OUTDIR}/text.xx-Zyyy.gz) \ + -xx_Latn >(pigz -9 >${OUTDIR}/text.xx-Latn.gz) \ + -xx_Grek >(pigz -9 >${OUTDIR}/text.xx-Grek.gz) \ + -xx_Cyrl >(pigz -9 >${OUTDIR}/text.xx-Cyrl.gz) \ + -xx_Armn >(pigz -9 >${OUTDIR}/text.xx-Armn.gz) \ + -xx_Hebr >(pigz -9 >${OUTDIR}/text.xx-Hebr.gz) \ + -xx_Arab >(pigz -9 >${OUTDIR}/text.xx-Arab.gz) \ + -xx_Syrc >(pigz -9 >${OUTDIR}/text.xx-Syrc.gz) \ + -xx_Thaa >(pigz -9 >${OUTDIR}/text.xx-Thaa.gz) \ + -xx_Deva >(pigz -9 >${OUTDIR}/text.xx-Deva.gz) \ + -xx_Beng >(pigz -9 
>${OUTDIR}/text.xx-Beng.gz) \ + -xx_Guru >(pigz -9 >${OUTDIR}/text.xx-Guru.gz) \ + -xx_Gujr >(pigz -9 >${OUTDIR}/text.xx-Gujr.gz) \ + -xx_Orya >(pigz -9 >${OUTDIR}/text.xx-Orya.gz) \ + -xx_Taml >(pigz -9 >${OUTDIR}/text.xx-Taml.gz) \ + -xx_Telu >(pigz -9 >${OUTDIR}/text.xx-Telu.gz) \ + -xx_Knda >(pigz -9 >${OUTDIR}/text.xx-Knda.gz) \ + -xx_Mlym >(pigz -9 >${OUTDIR}/text.xx-Mlym.gz) \ + -xx_Sinh >(pigz -9 >${OUTDIR}/text.xx-Sinh.gz) \ + -xx_Thai >(pigz -9 >${OUTDIR}/text.xx-Thai.gz) \ + -xx_Laoo >(pigz -9 >${OUTDIR}/text.xx-Laoo.gz) \ + -xx_Tibt >(pigz -9 >${OUTDIR}/text.xx-Tibt.gz) \ + -xx_Mymr >(pigz -9 >${OUTDIR}/text.xx-Mymr.gz) \ + -xx_Geor >(pigz -9 >${OUTDIR}/text.xx-Geor.gz) \ + -xx_Hang >(pigz -9 >${OUTDIR}/text.xx-Hang.gz) \ + -xx_Ethi >(pigz -9 >${OUTDIR}/text.xx-Ethi.gz) \ + -xx_Cher >(pigz -9 >${OUTDIR}/text.xx-Cher.gz) \ + -xx_Cans >(pigz -9 >${OUTDIR}/text.xx-Cans.gz) \ + -xx_Ogam >(pigz -9 >${OUTDIR}/text.xx-Ogam.gz) \ + -xx_Runr >(pigz -9 >${OUTDIR}/text.xx-Runr.gz) \ + -xx_Khmr >(pigz -9 >${OUTDIR}/text.xx-Khmr.gz) \ + -xx_Mong >(pigz -9 >${OUTDIR}/text.xx-Mong.gz) \ + -xx_Hira >(pigz -9 >${OUTDIR}/text.xx-Hira.gz) \ + -xx_Kana >(pigz -9 >${OUTDIR}/text.xx-Kana.gz) \ + -xx_Bopo >(pigz -9 >${OUTDIR}/text.xx-Bopo.gz) \ + -xx_Hani >(pigz -9 >${OUTDIR}/text.xx-Hani.gz) \ + -xx_Yiii >(pigz -9 >${OUTDIR}/text.xx-Yiii.gz) \ + -xx_Ital >(pigz -9 >${OUTDIR}/text.xx-Ital.gz) \ + -xx_Goth >(pigz -9 >${OUTDIR}/text.xx-Goth.gz) \ + -xx_Dsrt >(pigz -9 >${OUTDIR}/text.xx-Dsrt.gz) \ + -xx_Qaai >(pigz -9 >${OUTDIR}/text.xx-Qaai.gz) \ + -xx_Tglg >(pigz -9 >${OUTDIR}/text.xx-Tglg.gz) \ + -xx_Hano >(pigz -9 >${OUTDIR}/text.xx-Hano.gz) \ + -xx_Buhd >(pigz -9 >${OUTDIR}/text.xx-Buhd.gz) \ + -xx_Tagb >(pigz -9 >${OUTDIR}/text.xx-Tagb.gz) \ + -xx_Limb >(pigz -9 >${OUTDIR}/text.xx-Limb.gz) \ + -xx_Tale >(pigz -9 >${OUTDIR}/text.xx-Tale.gz) \ + -xx_Linb >(pigz -9 >${OUTDIR}/text.xx-Linb.gz) \ + -xx_Ugar >(pigz -9 >${OUTDIR}/text.xx-Ugar.gz) \ + -xx_Shaw >(pigz -9 
>${OUTDIR}/text.xx-Shaw.gz) \ + -xx_Osma >(pigz -9 >${OUTDIR}/text.xx-Osma.gz) \ + -xx_Cprt >(pigz -9 >${OUTDIR}/text.xx-Cprt.gz) \ + -xx_Brai >(pigz -9 >${OUTDIR}/text.xx-Brai.gz) \ + -xx_Bugi >(pigz -9 >${OUTDIR}/text.xx-Bugi.gz) \ + -xx_Copt >(pigz -9 >${OUTDIR}/text.xx-Copt.gz) \ + -xx_Talu >(pigz -9 >${OUTDIR}/text.xx-Talu.gz) \ + -xx_Glag >(pigz -9 >${OUTDIR}/text.xx-Glag.gz) \ + -xx_Tfng >(pigz -9 >${OUTDIR}/text.xx-Tfng.gz) \ + -xx_Sylo >(pigz -9 >${OUTDIR}/text.xx-Sylo.gz) \ + -xx_Xpeo >(pigz -9 >${OUTDIR}/text.xx-Xpeo.gz) \ + -xx_Khar >(pigz -9 >${OUTDIR}/text.xx-Khar.gz) \ + -xx_Bali >(pigz -9 >${OUTDIR}/text.xx-Bali.gz) \ + -xx_Xsux >(pigz -9 >${OUTDIR}/text.xx-Xsux.gz) \ + -xx_Phnx >(pigz -9 >${OUTDIR}/text.xx-Phnx.gz) \ + -xx_Phag >(pigz -9 >${OUTDIR}/text.xx-Phag.gz) \ + -xx_Nkoo >(pigz -9 >${OUTDIR}/text.xx-Nkoo.gz) \ + -xx_Sund >(pigz -9 >${OUTDIR}/text.xx-Sund.gz) \ + -xx_Lepc >(pigz -9 >${OUTDIR}/text.xx-Lepc.gz) \ + -xx_Olck >(pigz -9 >${OUTDIR}/text.xx-Olck.gz) \ + -xx_Vaii >(pigz -9 >${OUTDIR}/text.xx-Vaii.gz) \ + -xx_Saur >(pigz -9 >${OUTDIR}/text.xx-Saur.gz) \ + -xx_Kali >(pigz -9 >${OUTDIR}/text.xx-Kali.gz) \ + -xx_Rjng >(pigz -9 >${OUTDIR}/text.xx-Rjng.gz) \ + -xx_Lyci >(pigz -9 >${OUTDIR}/text.xx-Lyci.gz) \ + -xx_Cari >(pigz -9 >${OUTDIR}/text.xx-Cari.gz) \ + -xx_Lydi >(pigz -9 >${OUTDIR}/text.xx-Lydi.gz) \ + -xx_Cham >(pigz -9 >${OUTDIR}/text.xx-Cham.gz) \ + -xx_Lana >(pigz -9 >${OUTDIR}/text.xx-Lana.gz) \ + -xx_Tavt >(pigz -9 >${OUTDIR}/text.xx-Tavt.gz) \ + -xx_Avst >(pigz -9 >${OUTDIR}/text.xx-Avst.gz) \ + -xx_Egyp >(pigz -9 >${OUTDIR}/text.xx-Egyp.gz) \ + -xx_Samr >(pigz -9 >${OUTDIR}/text.xx-Samr.gz) \ + -xx_Lisu >(pigz -9 >${OUTDIR}/text.xx-Lisu.gz) \ + -xx_Bamu >(pigz -9 >${OUTDIR}/text.xx-Bamu.gz) \ + -xx_Java >(pigz -9 >${OUTDIR}/text.xx-Java.gz) \ + -xx_Mtei >(pigz -9 >${OUTDIR}/text.xx-Mtei.gz) \ + -xx_Armi >(pigz -9 >${OUTDIR}/text.xx-Armi.gz) \ + -xx_Sarb >(pigz -9 >${OUTDIR}/text.xx-Sarb.gz) \ + -xx_Prti >(pigz -9 
>${OUTDIR}/text.xx-Prti.gz) \ + -xx_Phli >(pigz -9 >${OUTDIR}/text.xx-Phli.gz) \ + -xx_Orkh >(pigz -9 >${OUTDIR}/text.xx-Orkh.gz) \ + -xx_Kthi >(pigz -9 >${OUTDIR}/text.xx-Kthi.gz) \ + -xx_Batk >(pigz -9 >${OUTDIR}/text.xx-Batk.gz) \ + -xx_Brah >(pigz -9 >${OUTDIR}/text.xx-Brah.gz) \ + -xx_Mand >(pigz -9 >${OUTDIR}/text.xx-Mand.gz) \ + -xx_Cakm >(pigz -9 >${OUTDIR}/text.xx-Cakm.gz) \ + -xx_Merc >(pigz -9 >${OUTDIR}/text.xx-Merc.gz) \ + -xx_Mero >(pigz -9 >${OUTDIR}/text.xx-Mero.gz) \ + -xx_Plrd >(pigz -9 >${OUTDIR}/text.xx-Plrd.gz) \ + -xx_Shrd >(pigz -9 >${OUTDIR}/text.xx-Shrd.gz) \ + -xx_Sora >(pigz -9 >${OUTDIR}/text.xx-Sora.gz) \ + -xx_Takr >(pigz -9 >${OUTDIR}/text.xx-Takr.gz) + touch ${DONEFILE} +fi diff --git a/raw/create_raw.sh b/raw/create_raw.sh new file mode 100755 index 0000000..4ce8acb --- /dev/null +++ b/raw/create_raw.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +set -e +set -o pipefail + +# Parallel command: +# cat langugage.codes | parallel --nice 19 --progress --sshloginfile file \ +# create_raw.sh $crawl_dir $out_dir {} + +INDIR=$1 +OUTDIR=$2 +LANGCODE=$3 + +OUTFILE=${OUTDIR}/${LANGCODE}.raw.xz +DONEFILE=${OUTFILE}.done + +unsafe_gunzip() { + # unsafe_gunzip makes it possible to open several .gz files which are corrupted. + # In our case many .gz files fail with a an "Unexpected end of file" error. 
+ set +e + set +o pipefail + for file in "$@"; do + gzip -cd "$file" 2> /dev/null + echo + done + set -o pipefail + set -e +} + +if [[ -f ${DONEFILE} ]]; then + exit 0 +fi + +unsafe_gunzip ${INDIR}/*/text.${LANGCODE}.gz | xz -c > "${OUTFILE}" + +touch "${DONEFILE}" diff --git a/raw/create_raw_en.sh b/raw/create_raw_en.sh new file mode 100755 index 0000000..3fa856d --- /dev/null +++ b/raw/create_raw_en.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -e +set -o pipefail + +SOURCEFILE=$1 +DESTINATION=$2 +INDEX=$3 +PADDEDINDEX=$(printf %02d $((INDEX - 1))) +NEWFILE=${DESTINATION}/en.${PADDEDINDEX}.raw.xz +DONEFILE=${NEWFILE}.done +if [[ -f ${DONEFILE} ]]; then + exit 0 +fi + +unsafe_gunzip() { + # unsafe_gunzip makes it possible to open several .gz files which are corrupted. + # In our case many .gz files fail with an "Unexpected end of file" error. + set +e + set +o pipefail + for file in "$@"; do + gzip -cd "$file" 2> /dev/null + echo + done + set -o pipefail + set -e +} + +unsafe_gunzip "${SOURCEFILE}" | xz -c > "${NEWFILE}" + +touch "${DONEFILE}" diff --git a/s3/FILES.md b/s3/FILES.md new file mode 100644 index 0000000..1a2ff0d --- /dev/null +++ b/s3/FILES.md @@ -0,0 +1,14 @@ +# S3 File Structure + +In general we have a separate directory for each language, which in turn contains up to three subdirectories: +``` +s3://web-language-models/ngrams/${lang}/deduped +s3://web-language-models/ngrams/${lang}/raw +s3://web-language-models/ngrams/${lang}/lm +``` + +The deduped folder contains the deduped file of that language with the corresponding offset file. The raw folder contains the `.raw` files for each individual crawl. The lm folder contains the language model, if one exists. + +## Irregularities + +The English language model is located at `s3://web-language-models/ngrams/lm/en.trie.xz` which is an artifact of the old file structure. All attempts to copy the language model on AWS failed due to the size of the model. 
We might need to reupload it if we want to change its location. diff --git a/s3/check_deduped_en.sh b/s3/check_deduped_en.sh new file mode 100755 index 0000000..56b5522 --- /dev/null +++ b/s3/check_deduped_en.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e +set -o pipefail + +for i in $(seq -f "%02g" 0 99); do + SOURCEFILE="s3://web-language-models/ngrams/deduped/en/en.${i}.deduped.xz" + TARGETFILE="s3://web-language-models/ngrams/en/deduped/en.${i}.deduped.xz" + SOURCESIZE=$(s3cmd ls ${SOURCEFILE} | cut -d ' ' -f 3) + TARGETSIZE=$(s3cmd ls ${TARGETFILE} | cut -d ' ' -f 3) + if [[ ! ${SOURCESIZE} -eq ${TARGETSIZE} ]]; then + echo "Mismatch on file ${SOURCEFILE}" + fi +done diff --git a/s3/check_raw_uploads.sh b/s3/check_raw_uploads.sh new file mode 100755 index 0000000..b32d576 --- /dev/null +++ b/s3/check_raw_uploads.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e +set -o pipefail + +FILE=$(echo $1 | awk ' BEGIN { FS = "/" } { print $(NF) }') + +BUCKET="s3://web-language-models/ngrams/en/raw/${FILE}" + +MD5SUM1=$(echo $(/home/tim/bin/s3cmd/s3cmd ls --list-md5 ${BUCKET}) | awk 'BEGIN { FS = " " } {print $(NF-1)}') +MD5SUM2=$(md5sum $1 | awk 'BEGIN { FS = " " } {print $1}') +if [ ${MD5SUM1} != ${MD5SUM2} ]; then + echo "$1" +fi diff --git a/s3/check_raw_uploads_en.sh b/s3/check_raw_uploads_en.sh new file mode 100644 index 0000000..ef7029a --- /dev/null +++ b/s3/check_raw_uploads_en.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +FILE=$(echo "$1" | awk ' BEGIN { FS = "/" } { print $(NF) }') +OUTFILE="$2" + +MD5SUM=$(md5sum "$1" | awk 'BEGIN { FS = " " } {print $1}') +echo "${MD5SUM} ${FILE}" >> "${OUTFILE}" diff --git a/s3/continue_multipart.sh b/s3/continue_multipart.sh new file mode 100755 index 0000000..50448c8 --- /dev/null +++ b/s3/continue_multipart.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e +set -o pipefail + +s3cmd multipart s3://web-language-models/ngrams/raw_en/ | \ + grep -o "en.[^/]*.xz" | \ + parallel --nice 19 --progress -j 8 s3cmd put --continue-put {} 
s3://web-language-models/ngrams/en/raw/{} diff --git a/s3/copy_deduped.py b/s3/copy_deduped.py new file mode 100755 index 0000000..14f9158 --- /dev/null +++ b/s3/copy_deduped.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +import sys +import boto3 +from boto3.s3.transfer import TransferConfig + +for line in sys.stdin: + link = line.split()[-1] + if ".xz" in link: + lang = link.split('/')[-1].split('.')[0] + sourcekey = "ngrams/deduped/{lang}.deduped.xz".format(lang=lang) + targetkey = "ngrams/{lang}/deduped/{lang}.deduped.xz".format(lang=lang) + + print("Copy from {} to {}..".format(sourcekey, targetkey)) + chunksize = 1000 * 1000000 + transferConfig = TransferConfig(multipart_threshold=chunksize, multipart_chunksize=chunksize) + + s3 = boto3.resource('s3') + copy_source = { + 'Bucket': 'web-language-models', + 'Key': sourcekey + } + s3.meta.client.copy(copy_source, 'web-language-models', targetkey, Config=transferConfig) + diff --git a/s3/copy_deduped_en.py b/s3/copy_deduped_en.py new file mode 100755 index 0000000..e7d7ea6 --- /dev/null +++ b/s3/copy_deduped_en.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +import sys +import boto3 +from boto3.s3.transfer import TransferConfig + +for line in sys.stdin: + link = line.split()[-1] + if ".xz" in link: + index = link.split('/')[-1].split('.')[1] + sourcekey = "ngrams/deduped/en/en.{index}.deduped.xz".format(index=index) + targetkey = "ngrams/en/deduped/en.{index}.deduped.xz".format(index=index) + + print("Copy from {} to {}..".format(sourcekey, targetkey)) + chunksize = 1000 * 1000000 + transferConfig = TransferConfig(multipart_threshold=chunksize, multipart_chunksize=chunksize) + + s3 = boto3.resource('s3') + copy_source = { + 'Bucket': 'web-language-models', + 'Key': sourcekey + } + s3.meta.client.copy(copy_source, 'web-language-models', targetkey, Config=transferConfig) + diff --git a/s3/count_uploads.sh b/s3/count_uploads.sh new file mode 100755 index 0000000..02b4a28 --- /dev/null +++ b/s3/count_uploads.sh @@ -0,0 
+1,23 @@ +#!/bin/bash + +TOTAL=0 +UPLOADED=0 +echo -n "${TOTAL}" +for filepath in /fs/vali0/www/data.statmt.org/ngrams/raw/*.xz; do + TOTAL=$((TOTAL+1)) + echo -en "\e[1A"; echo -e "\e[0K\r ${TOTAL}" + FILE=$(echo $filepath | awk ' BEGIN { FS = "/" } { print $(NF) }') + LANGUAGE=$(echo ${FILE} | awk ' BEGIN { FS = "." } { print $1 }') + YEAR=$(echo ${FILE} | awk ' BEGIN {FS = "." } { print $2 }') + VERSION="00" + + NEW_FILENAME=$(echo ${FILE} | sed "s/[0-9_]\{1,\}/${YEAR}.${VERSION}/") + BUCKET="s3://web-language-models/ngrams/${LANGUAGE}/raw/${NEW_FILENAME}" + + # Increase counter if file already exists. + if [[ ! -z $(/home/tim/bin/s3cmd/s3cmd ls ${BUCKET}) ]]; then + UPLOADED=$((UPLOADED+1)) + fi +done + +echo -en "\e[1A"; echo -e "\e[0K\r ${UPLOADED}/${TOTAL}" diff --git a/s3/create_md5_sums.sh b/s3/create_md5_sums.sh new file mode 100755 index 0000000..8fb0f20 --- /dev/null +++ b/s3/create_md5_sums.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +FILE=$(echo "$1" | awk ' BEGIN { FS = "/" } { print $(NF) }') +OUTFILE="$2" + +MD5SUM=$(md5sum $1 | awk 'BEGIN { FS = " " } {print $1}') +echo "${MD5SUM} ${FILE}" >> "${OUTFILE}" diff --git a/s3/index_files/index_lm.html b/s3/index_files/index_lm.html new file mode 100644 index 0000000..4fddcb1 --- /dev/null +++ b/s3/index_files/index_lm.html @@ -0,0 +1,18 @@ + + + + Language Models + + + +

Language Models

+ + + diff --git a/s3/index_files/index_raw.html b/s3/index_files/index_raw.html new file mode 100644 index 0000000..6464843 --- /dev/null +++ b/s3/index_files/index_raw.html @@ -0,0 +1,926 @@ + + + + Raw CommonCrawl + + + +

Raw CommonCrawl

+ + + diff --git a/s3/index_files/raw_links b/s3/index_files/raw_links new file mode 100644 index 0000000..2aaae22 --- /dev/null +++ b/s3/index_files/raw_links @@ -0,0 +1,914 @@ +
  • aa.2012.00.raw.xz
  • +
  • aa.2013_1.00.raw.xz
  • +
  • aa.2013_2.00.raw.xz
  • +
  • aa.2014_1.00.raw.xz
  • +
  • ab.2012.00.raw.xz
  • +
  • ab.2013_1.00.raw.xz
  • +
  • ab.2013_2.00.raw.xz
  • +
  • ab.2014_1.00.raw.xz
  • +
  • af.2012.00.raw.xz
  • +
  • af.2013_1.00.raw.xz
  • +
  • af.2013_2.00.raw.xz
  • +
  • af.2014_1.00.raw.xz
  • +
  • ak.2012.00.raw.xz
  • +
  • ak.2013_1.00.raw.xz
  • +
  • ak.2013_2.00.raw.xz
  • +
  • ak.2014_1.00.raw.xz
  • +
  • am.2012.00.raw.xz
  • +
  • am.2013_1.00.raw.xz
  • +
  • am.2013_2.00.raw.xz
  • +
  • am.2014_1.00.raw.xz
  • +
  • ar.2012.00.raw.xz
  • +
  • ar.2013_1.00.raw.xz
  • +
  • ar.2013_2.00.raw.xz
  • +
  • ar.2014_1.00.raw.xz
  • +
  • as.2012.00.raw.xz
  • +
  • as.2013_1.00.raw.xz
  • +
  • as.2013_2.00.raw.xz
  • +
  • as.2014_1.00.raw.xz
  • +
  • ay.2012.00.raw.xz
  • +
  • ay.2013_1.00.raw.xz
  • +
  • ay.2013_2.00.raw.xz
  • +
  • ay.2014_1.00.raw.xz
  • +
  • az.2012.00.raw.xz
  • +
  • az.2013_1.00.raw.xz
  • +
  • az.2013_2.00.raw.xz
  • +
  • az.2014_1.00.raw.xz
  • +
  • ba.2012.00.raw.xz
  • +
  • ba.2013_1.00.raw.xz
  • +
  • ba.2013_2.00.raw.xz
  • +
  • ba.2014_1.00.raw.xz
  • +
  • be.2012.00.raw.xz
  • +
  • be.2013_1.00.raw.xz
  • +
  • be.2013_2.00.raw.xz
  • +
  • be.2014_1.00.raw.xz
  • +
  • bg.2012.00.raw.xz
  • +
  • bg.2013_1.00.raw.xz
  • +
  • bg.2013_2.00.raw.xz
  • +
  • bg.2014_1.00.raw.xz
  • +
  • bh.2012.00.raw.xz
  • +
  • bh.2013_1.00.raw.xz
  • +
  • bh.2013_2.00.raw.xz
  • +
  • bh.2014_1.00.raw.xz
  • +
  • bi.2012.00.raw.xz
  • +
  • bi.2013_1.00.raw.xz
  • +
  • bi.2013_2.00.raw.xz
  • +
  • bi.2014_1.00.raw.xz
  • +
  • blu.2012.00.raw.xz
  • +
  • blu.2013_1.00.raw.xz
  • +
  • blu.2013_2.00.raw.xz
  • +
  • blu.2014_1.00.raw.xz
  • +
  • bn.2012.00.raw.xz
  • +
  • bn.2013_1.00.raw.xz
  • +
  • bn.2013_2.00.raw.xz
  • +
  • bn.2014_1.00.raw.xz
  • +
  • bo.2012.00.raw.xz
  • +
  • bo.2013_1.00.raw.xz
  • +
  • bo.2013_2.00.raw.xz
  • +
  • bo.2014_1.00.raw.xz
  • +
  • br.2012.00.raw.xz
  • +
  • br.2013_1.00.raw.xz
  • +
  • br.2013_2.00.raw.xz
  • +
  • br.2014_1.00.raw.xz
  • +
  • bs.2012.00.raw.xz
  • +
  • bs.2013_1.00.raw.xz
  • +
  • bs.2013_2.00.raw.xz
  • +
  • bs.2014_1.00.raw.xz
  • +
  • ca.2012.00.raw.xz
  • +
  • ca.2013_1.00.raw.xz
  • +
  • ca.2013_2.00.raw.xz
  • +
  • ca.2014_1.00.raw.xz
  • +
  • ceb.2012.00.raw.xz
  • +
  • ceb.2013_1.00.raw.xz
  • +
  • ceb.2013_2.00.raw.xz
  • +
  • ceb.2014_1.00.raw.xz
  • +
  • chr.2012.00.raw.xz
  • +
  • chr.2013_1.00.raw.xz
  • +
  • chr.2013_2.00.raw.xz
  • +
  • chr.2014_1.00.raw.xz
  • +
  • co.2012.00.raw.xz
  • +
  • co.2013_1.00.raw.xz
  • +
  • co.2013_2.00.raw.xz
  • +
  • co.2014_1.00.raw.xz
  • +
  • crs.2012.00.raw.xz
  • +
  • crs.2013_1.00.raw.xz
  • +
  • crs.2013_2.00.raw.xz
  • +
  • crs.2014_1.00.raw.xz
  • +
  • cs.2012.00.raw.xz
  • +
  • cs.2013_1.00.raw.xz
  • +
  • cs.2013_2.00.raw.xz
  • +
  • cs.2014_1.00.raw.xz
  • +
  • cy.2012.00.raw.xz
  • +
  • cy.2013_1.00.raw.xz
  • +
  • cy.2013_2.00.raw.xz
  • +
  • cy.2014_1.00.raw.xz
  • +
  • da.2012.00.raw.xz
  • +
  • da.2013_1.00.raw.xz
  • +
  • da.2013_2.00.raw.xz
  • +
  • da.2014_1.00.raw.xz
  • +
  • de.2012.00.raw.xz
  • +
  • de.2013_1.00.raw.xz
  • +
  • de.2013_2.00.raw.xz
  • +
  • de.2014_1.00.raw.xz
  • +
  • dv.2012.00.raw.xz
  • +
  • dv.2013_1.00.raw.xz
  • +
  • dv.2013_2.00.raw.xz
  • +
  • dv.2014_1.00.raw.xz
  • +
  • dz.2012.00.raw.xz
  • +
  • dz.2013_1.00.raw.xz
  • +
  • dz.2013_2.00.raw.xz
  • +
  • dz.2014_1.00.raw.xz
  • +
  • el.2012.00.raw.xz
  • +
  • el.2013_1.00.raw.xz
  • +
  • el.2013_2.00.raw.xz
  • +
  • el.2014_1.00.raw.xz
  • +
  • eo.2012.00.raw.xz
  • +
  • eo.2013_1.00.raw.xz
  • +
  • eo.2013_2.00.raw.xz
  • +
  • eo.2014_1.00.raw.xz
  • +
  • es.2012.00.raw.xz
  • +
  • es.2013_1.00.raw.xz
  • +
  • es.2013_2.00.raw.xz
  • +
  • es.2014_1.00.raw.xz
  • +
  • et.2012.00.raw.xz
  • +
  • et.2013_1.00.raw.xz
  • +
  • et.2013_2.00.raw.xz
  • +
  • et.2014_1.00.raw.xz
  • +
  • eu.2012.00.raw.xz
  • +
  • eu.2013_1.00.raw.xz
  • +
  • eu.2013_2.00.raw.xz
  • +
  • eu.2014_1.00.raw.xz
  • +
  • fa.2012.00.raw.xz
  • +
  • fa.2013_1.00.raw.xz
  • +
  • fa.2013_2.00.raw.xz
  • +
  • fa.2014_1.00.raw.xz
  • +
  • fi.2012.00.raw.xz
  • +
  • fi.2013_1.00.raw.xz
  • +
  • fi.2013_2.00.raw.xz
  • +
  • fi.2014_1.00.raw.xz
  • +
  • fj.2012.00.raw.xz
  • +
  • fj.2013_1.00.raw.xz
  • +
  • fj.2013_2.00.raw.xz
  • +
  • fj.2014_1.00.raw.xz
  • +
  • fo.2012.00.raw.xz
  • +
  • fo.2013_1.00.raw.xz
  • +
  • fo.2013_2.00.raw.xz
  • +
  • fo.2014_1.00.raw.xz
  • +
  • fr.2012.00.raw.xz
  • +
  • fr.2013_1.00.raw.xz
  • +
  • fr.2013_2.00.raw.xz
  • +
  • fr.2014_1.00.raw.xz
  • +
  • fy.2012.00.raw.xz
  • +
  • fy.2013_1.00.raw.xz
  • +
  • fy.2013_2.00.raw.xz
  • +
  • fy.2014_1.00.raw.xz
  • +
  • ga.2012.00.raw.xz
  • +
  • ga.2013_1.00.raw.xz
  • +
  • ga.2013_2.00.raw.xz
  • +
  • ga.2014_1.00.raw.xz
  • +
  • gd.2012.00.raw.xz
  • +
  • gd.2013_1.00.raw.xz
  • +
  • gd.2013_2.00.raw.xz
  • +
  • gd.2014_1.00.raw.xz
  • +
  • gl.2012.00.raw.xz
  • +
  • gl.2013_1.00.raw.xz
  • +
  • gl.2013_2.00.raw.xz
  • +
  • gl.2014_1.00.raw.xz
  • +
  • gn.2012.00.raw.xz
  • +
  • gn.2013_1.00.raw.xz
  • +
  • gn.2013_2.00.raw.xz
  • +
  • gn.2014_1.00.raw.xz
  • +
  • gu.2012.00.raw.xz
  • +
  • gu.2013_1.00.raw.xz
  • +
  • gu.2013_2.00.raw.xz
  • +
  • gu.2014_1.00.raw.xz
  • +
  • gv.2012.00.raw.xz
  • +
  • gv.2013_1.00.raw.xz
  • +
  • gv.2013_2.00.raw.xz
  • +
  • gv.2014_1.00.raw.xz
  • +
  • ha.2012.00.raw.xz
  • +
  • ha.2013_1.00.raw.xz
  • +
  • ha.2013_2.00.raw.xz
  • +
  • ha.2014_1.00.raw.xz
  • +
  • haw.2012.00.raw.xz
  • +
  • haw.2013_1.00.raw.xz
  • +
  • haw.2013_2.00.raw.xz
  • +
  • haw.2014_1.00.raw.xz
  • +
  • hi.2012.00.raw.xz
  • +
  • hi.2013_1.00.raw.xz
  • +
  • hi.2013_2.00.raw.xz
  • +
  • hi.2014_1.00.raw.xz
  • +
  • hr.2012.00.raw.xz
  • +
  • hr.2013_1.00.raw.xz
  • +
  • hr.2013_2.00.raw.xz
  • +
  • hr.2014_1.00.raw.xz
  • +
  • ht.2012.00.raw.xz
  • +
  • ht.2013_1.00.raw.xz
  • +
  • ht.2013_2.00.raw.xz
  • +
  • ht.2014_1.00.raw.xz
  • +
  • hu.2012.00.raw.xz
  • +
  • hu.2013_1.00.raw.xz
  • +
  • hu.2013_2.00.raw.xz
  • +
  • hu.2014_1.00.raw.xz
  • +
  • hy.2012.00.raw.xz
  • +
  • hy.2013_1.00.raw.xz
  • +
  • hy.2013_2.00.raw.xz
  • +
  • hy.2014_1.00.raw.xz
  • +
  • ia.2012.00.raw.xz
  • +
  • ia.2013_1.00.raw.xz
  • +
  • ia.2013_2.00.raw.xz
  • +
  • ia.2014_1.00.raw.xz
  • +
  • id.2012.00.raw.xz
  • +
  • id.2013_1.00.raw.xz
  • +
  • id.2013_2.00.raw.xz
  • +
  • id.2014_1.00.raw.xz
  • +
  • ie.2012.00.raw.xz
  • +
  • ie.2013_1.00.raw.xz
  • +
  • ie.2013_2.00.raw.xz
  • +
  • ie.2014_1.00.raw.xz
  • +
  • ig.2012.00.raw.xz
  • +
  • ig.2013_1.00.raw.xz
  • +
  • ig.2013_2.00.raw.xz
  • +
  • ig.2014_1.00.raw.xz
  • +
  • ik.2012.00.raw.xz
  • +
  • ik.2013_1.00.raw.xz
  • +
  • ik.2013_2.00.raw.xz
  • +
  • ik.2014_1.00.raw.xz
  • +
  • is.2012.00.raw.xz
  • +
  • is.2013_1.00.raw.xz
  • +
  • is.2013_2.00.raw.xz
  • +
  • is.2014_1.00.raw.xz
  • +
  • it.2012.00.raw.xz
  • +
  • it.2013_1.00.raw.xz
  • +
  • it.2013_2.00.raw.xz
  • +
  • it.2014_1.00.raw.xz
  • +
  • iu.2012.00.raw.xz
  • +
  • iu.2013_1.00.raw.xz
  • +
  • iu.2013_2.00.raw.xz
  • +
  • iu.2014_1.00.raw.xz
  • +
  • iw.2012.00.raw.xz
  • +
  • iw.2013_1.00.raw.xz
  • +
  • iw.2013_2.00.raw.xz
  • +
  • iw.2014_1.00.raw.xz
  • +
  • ja.2012.00.raw.xz
  • +
  • ja.2013_1.00.raw.xz
  • +
  • ja.2013_2.00.raw.xz
  • +
  • ja.2014_1.00.raw.xz
  • +
  • jw.2012.00.raw.xz
  • +
  • jw.2013_1.00.raw.xz
  • +
  • jw.2013_2.00.raw.xz
  • +
  • jw.2014_1.00.raw.xz
  • +
  • ka.2012.00.raw.xz
  • +
  • ka.2013_1.00.raw.xz
  • +
  • ka.2013_2.00.raw.xz
  • +
  • ka.2014_1.00.raw.xz
  • +
  • kha.2012.00.raw.xz
  • +
  • kha.2013_1.00.raw.xz
  • +
  • kha.2013_2.00.raw.xz
  • +
  • kha.2014_1.00.raw.xz
  • +
  • kk.2012.00.raw.xz
  • +
  • kk.2013_1.00.raw.xz
  • +
  • kk.2013_2.00.raw.xz
  • +
  • kk.2014_1.00.raw.xz
  • +
  • kl.2012.00.raw.xz
  • +
  • kl.2013_1.00.raw.xz
  • +
  • kl.2013_2.00.raw.xz
  • +
  • kl.2014_1.00.raw.xz
  • +
  • km.2012.00.raw.xz
  • +
  • km.2013_1.00.raw.xz
  • +
  • km.2013_2.00.raw.xz
  • +
  • km.2014_1.00.raw.xz
  • +
  • kn.2012.00.raw.xz
  • +
  • kn.2013_1.00.raw.xz
  • +
  • kn.2013_2.00.raw.xz
  • +
  • kn.2014_1.00.raw.xz
  • +
  • ko.2012.00.raw.xz
  • +
  • ko.2013_1.00.raw.xz
  • +
  • ko.2013_2.00.raw.xz
  • +
  • ko.2014_1.00.raw.xz
  • +
  • ks.2012.00.raw.xz
  • +
  • ks.2013_1.00.raw.xz
  • +
  • ks.2013_2.00.raw.xz
  • +
  • ks.2014_1.00.raw.xz
  • +
  • ku.2012.00.raw.xz
  • +
  • ku.2013_1.00.raw.xz
  • +
  • ku.2013_2.00.raw.xz
  • +
  • ku.2014_1.00.raw.xz
  • +
  • ky.2012.00.raw.xz
  • +
  • ky.2013_1.00.raw.xz
  • +
  • ky.2013_2.00.raw.xz
  • +
  • ky.2014_1.00.raw.xz
  • +
  • la.2012.00.raw.xz
  • +
  • la.2013_1.00.raw.xz
  • +
  • la.2013_2.00.raw.xz
  • +
  • la.2014_1.00.raw.xz
  • +
  • lb.2012.00.raw.xz
  • +
  • lb.2013_1.00.raw.xz
  • +
  • lb.2013_2.00.raw.xz
  • +
  • lb.2014_1.00.raw.xz
  • +
  • lg.2012.00.raw.xz
  • +
  • lg.2013_1.00.raw.xz
  • +
  • lg.2013_2.00.raw.xz
  • +
  • lg.2014_1.00.raw.xz
  • +
  • lif.2012.00.raw.xz
  • +
  • lif.2013_1.00.raw.xz
  • +
  • lif.2013_2.00.raw.xz
  • +
  • lif.2014_1.00.raw.xz
  • +
  • ln.2012.00.raw.xz
  • +
  • ln.2013_1.00.raw.xz
  • +
  • ln.2013_2.00.raw.xz
  • +
  • ln.2014_1.00.raw.xz
  • +
  • lo.2012.00.raw.xz
  • +
  • lo.2013_1.00.raw.xz
  • +
  • lo.2013_2.00.raw.xz
  • +
  • lo.2014_1.00.raw.xz
  • +
  • lt.2012.00.raw.xz
  • +
  • lt.2013_1.00.raw.xz
  • +
  • lt.2013_2.00.raw.xz
  • +
  • lt.2014_1.00.raw.xz
  • +
  • lv.2012.00.raw.xz
  • +
  • lv.2013_1.00.raw.xz
  • +
  • lv.2013_2.00.raw.xz
  • +
  • lv.2014_1.00.raw.xz
  • +
  • mfe.2012.00.raw.xz
  • +
  • mfe.2013_1.00.raw.xz
  • +
  • mfe.2013_2.00.raw.xz
  • +
  • mfe.2014_1.00.raw.xz
  • +
  • mg.2012.00.raw.xz
  • +
  • mg.2013_1.00.raw.xz
  • +
  • mg.2013_2.00.raw.xz
  • +
  • mg.2014_1.00.raw.xz
  • +
  • mi.2012.00.raw.xz
  • +
  • mi.2013_1.00.raw.xz
  • +
  • mi.2013_2.00.raw.xz
  • +
  • mi.2014_1.00.raw.xz
  • +
  • mk.2012.00.raw.xz
  • +
  • mk.2013_1.00.raw.xz
  • +
  • mk.2013_2.00.raw.xz
  • +
  • mk.2014_1.00.raw.xz
  • +
  • ml.2012.00.raw.xz
  • +
  • ml.2013_1.00.raw.xz
  • +
  • ml.2013_2.00.raw.xz
  • +
  • ml.2014_1.00.raw.xz
  • +
  • mn.2012.00.raw.xz
  • +
  • mn.2013_1.00.raw.xz
  • +
  • mn.2013_2.00.raw.xz
  • +
  • mn.2014_1.00.raw.xz
  • +
  • mr.2012.00.raw.xz
  • +
  • mr.2013_1.00.raw.xz
  • +
  • mr.2013_2.00.raw.xz
  • +
  • mr.2014_1.00.raw.xz
  • +
  • ms.2012.00.raw.xz
  • +
  • ms.2013_1.00.raw.xz
  • +
  • ms.2013_2.00.raw.xz
  • +
  • ms.2014_1.00.raw.xz
  • +
  • mt.2012.00.raw.xz
  • +
  • mt.2013_1.00.raw.xz
  • +
  • mt.2013_2.00.raw.xz
  • +
  • mt.2014_1.00.raw.xz
  • +
  • my.2012.00.raw.xz
  • +
  • my.2013_1.00.raw.xz
  • +
  • my.2013_2.00.raw.xz
  • +
  • my.2014_1.00.raw.xz
  • +
  • na.2012.00.raw.xz
  • +
  • na.2013_1.00.raw.xz
  • +
  • na.2013_2.00.raw.xz
  • +
  • na.2014_1.00.raw.xz
  • +
  • ne.2012.00.raw.xz
  • +
  • ne.2013_1.00.raw.xz
  • +
  • ne.2013_2.00.raw.xz
  • +
  • ne.2014_1.00.raw.xz
  • +
  • nl.2012.00.raw.xz
  • +
  • nl.2013_1.00.raw.xz
  • +
  • nl.2013_2.00.raw.xz
  • +
  • nl.2014_1.00.raw.xz
  • +
  • nn.2012.00.raw.xz
  • +
  • nn.2013_1.00.raw.xz
  • +
  • nn.2013_2.00.raw.xz
  • +
  • nn.2014_1.00.raw.xz
  • +
  • no.2012.00.raw.xz
  • +
  • no.2013_1.00.raw.xz
  • +
  • no.2013_2.00.raw.xz
  • +
  • no.2014_1.00.raw.xz
  • +
  • nso.2012.00.raw.xz
  • +
  • nso.2013_1.00.raw.xz
  • +
  • nso.2013_2.00.raw.xz
  • +
  • nso.2014_1.00.raw.xz
  • +
  • ny.2012.00.raw.xz
  • +
  • ny.2013_1.00.raw.xz
  • +
  • ny.2013_2.00.raw.xz
  • +
  • ny.2014_1.00.raw.xz
  • +
  • oc.2012.00.raw.xz
  • +
  • oc.2013_1.00.raw.xz
  • +
  • oc.2013_2.00.raw.xz
  • +
  • oc.2014_1.00.raw.xz
  • +
  • om.2012.00.raw.xz
  • +
  • om.2013_1.00.raw.xz
  • +
  • om.2013_2.00.raw.xz
  • +
  • om.2014_1.00.raw.xz
  • +
  • or.2012.00.raw.xz
  • +
  • or.2013_1.00.raw.xz
  • +
  • or.2013_2.00.raw.xz
  • +
  • or.2014_1.00.raw.xz
  • +
  • pa.2012.00.raw.xz
  • +
  • pa.2013_1.00.raw.xz
  • +
  • pa.2013_2.00.raw.xz
  • +
  • pa.2014_1.00.raw.xz
  • +
  • pl.2012.00.raw.xz
  • +
  • pl.2013_1.00.raw.xz
  • +
  • pl.2013_2.00.raw.xz
  • +
  • pl.2014_1.00.raw.xz
  • +
  • ps.2012.00.raw.xz
  • +
  • ps.2013_1.00.raw.xz
  • +
  • ps.2013_2.00.raw.xz
  • +
  • ps.2014_1.00.raw.xz
  • +
  • pt.2012.00.raw.xz
  • +
  • pt.2013_1.00.raw.xz
  • +
  • pt.2013_2.00.raw.xz
  • +
  • pt.2014_1.00.raw.xz
  • +
  • qu.2012.00.raw.xz
  • +
  • qu.2013_1.00.raw.xz
  • +
  • qu.2013_2.00.raw.xz
  • +
  • qu.2014_1.00.raw.xz
  • +
  • rm.2012.00.raw.xz
  • +
  • rm.2013_1.00.raw.xz
  • +
  • rm.2013_2.00.raw.xz
  • +
  • rm.2014_1.00.raw.xz
  • +
  • rn.2012.00.raw.xz
  • +
  • rn.2013_1.00.raw.xz
  • +
  • rn.2013_2.00.raw.xz
  • +
  • rn.2014_1.00.raw.xz
  • +
  • ro.2012.00.raw.xz
  • +
  • ro.2013_1.00.raw.xz
  • +
  • ro.2013_2.00.raw.xz
  • +
  • ro.2014_1.00.raw.xz
  • +
  • ru.2012.00.raw.xz
  • +
  • ru.2013_1.00.raw.xz
  • +
  • ru.2013_2.00.raw.xz
  • +
  • ru.2014_1.00.raw.xz
  • +
  • rw.2012.00.raw.xz
  • +
  • rw.2013_1.00.raw.xz
  • +
  • rw.2013_2.00.raw.xz
  • +
  • rw.2014_1.00.raw.xz
  • +
  • sa.2012.00.raw.xz
  • +
  • sa.2013_1.00.raw.xz
  • +
  • sa.2013_2.00.raw.xz
  • +
  • sa.2014_1.00.raw.xz
  • +
  • sco.2012.00.raw.xz
  • +
  • sco.2013_1.00.raw.xz
  • +
  • sco.2013_2.00.raw.xz
  • +
  • sco.2014_1.00.raw.xz
  • +
  • sd.2012.00.raw.xz
  • +
  • sd.2013_1.00.raw.xz
  • +
  • sd.2013_2.00.raw.xz
  • +
  • sd.2014_1.00.raw.xz
  • +
  • sg.2012.00.raw.xz
  • +
  • sg.2013_1.00.raw.xz
  • +
  • sg.2013_2.00.raw.xz
  • +
  • sg.2014_1.00.raw.xz
  • +
  • si.2012.00.raw.xz
  • +
  • si.2013_1.00.raw.xz
  • +
  • si.2013_2.00.raw.xz
  • +
  • si.2014_1.00.raw.xz
  • +
  • sk.2012.00.raw.xz
  • +
  • sk.2013_1.00.raw.xz
  • +
  • sk.2013_2.00.raw.xz
  • +
  • sk.2014_1.00.raw.xz
  • +
  • sl.2012.00.raw.xz
  • +
  • sl.2013_1.00.raw.xz
  • +
  • sl.2013_2.00.raw.xz
  • +
  • sl.2014_1.00.raw.xz
  • +
  • sm.2012.00.raw.xz
  • +
  • sm.2013_1.00.raw.xz
  • +
  • sm.2013_2.00.raw.xz
  • +
  • sm.2014_1.00.raw.xz
  • +
  • sn.2012.00.raw.xz
  • +
  • sn.2013_1.00.raw.xz
  • +
  • sn.2013_2.00.raw.xz
  • +
  • sn.2014_1.00.raw.xz
  • +
  • so.2012.00.raw.xz
  • +
  • so.2013_1.00.raw.xz
  • +
  • so.2013_2.00.raw.xz
  • +
  • so.2014_1.00.raw.xz
  • +
  • sq.2012.00.raw.xz
  • +
  • sq.2013_1.00.raw.xz
  • +
  • sq.2013_2.00.raw.xz
  • +
  • sq.2014_1.00.raw.xz
  • +
  • sr.2012.00.raw.xz
  • +
  • sr.2013_1.00.raw.xz
  • +
  • sr.2013_2.00.raw.xz
  • +
  • sr.2014_1.00.raw.xz
  • +
  • sr-ME.2012.00.raw.xz
  • +
  • sr-ME.2013_1.00.raw.xz
  • +
  • sr-ME.2013_2.00.raw.xz
  • +
  • sr-ME.2014_1.00.raw.xz
  • +
  • ss.2012.00.raw.xz
  • +
  • ss.2013_1.00.raw.xz
  • +
  • ss.2013_2.00.raw.xz
  • +
  • ss.2014_1.00.raw.xz
  • +
  • st.2012.00.raw.xz
  • +
  • st.2013_1.00.raw.xz
  • +
  • st.2013_2.00.raw.xz
  • +
  • st.2014_1.00.raw.xz
  • +
  • su.2012.00.raw.xz
  • +
  • su.2013_1.00.raw.xz
  • +
  • su.2013_2.00.raw.xz
  • +
  • su.2014_1.00.raw.xz
  • +
  • sv.2012.00.raw.xz
  • +
  • sv.2013_1.00.raw.xz
  • +
  • sv.2013_2.00.raw.xz
  • +
  • sv.2014_1.00.raw.xz
  • +
  • sw.2012.00.raw.xz
  • +
  • sw.2013_1.00.raw.xz
  • +
  • sw.2013_2.00.raw.xz
  • +
  • sw.2014_1.00.raw.xz
  • +
  • syr.2012.00.raw.xz
  • +
  • syr.2013_1.00.raw.xz
  • +
  • syr.2013_2.00.raw.xz
  • +
  • syr.2014_1.00.raw.xz
  • +
  • ta.2012.00.raw.xz
  • +
  • ta.2013_1.00.raw.xz
  • +
  • ta.2013_2.00.raw.xz
  • +
  • ta.2014_1.00.raw.xz
  • +
  • te.2012.00.raw.xz
  • +
  • te.2013_1.00.raw.xz
  • +
  • te.2013_2.00.raw.xz
  • +
  • te.2014_1.00.raw.xz
  • +
  • tg.2012.00.raw.xz
  • +
  • tg.2013_1.00.raw.xz
  • +
  • tg.2013_2.00.raw.xz
  • +
  • tg.2014_1.00.raw.xz
  • +
  • th.2012.00.raw.xz
  • +
  • th.2013_1.00.raw.xz
  • +
  • th.2013_2.00.raw.xz
  • +
  • th.2014_1.00.raw.xz
  • +
  • ti.2012.00.raw.xz
  • +
  • ti.2013_1.00.raw.xz
  • +
  • ti.2013_2.00.raw.xz
  • +
  • ti.2014_1.00.raw.xz
  • +
  • tk.2012.00.raw.xz
  • +
  • tk.2013_1.00.raw.xz
  • +
  • tk.2013_2.00.raw.xz
  • +
  • tk.2014_1.00.raw.xz
  • +
  • tl.2012.00.raw.xz
  • +
  • tl.2013_1.00.raw.xz
  • +
  • tl.2013_2.00.raw.xz
  • +
  • tl.2014_1.00.raw.xz
  • +
  • tlh.2012.00.raw.xz
  • +
  • tlh.2013_1.00.raw.xz
  • +
  • tlh.2013_2.00.raw.xz
  • +
  • tlh.2014_1.00.raw.xz
  • +
  • tn.2012.00.raw.xz
  • +
  • tn.2013_1.00.raw.xz
  • +
  • tn.2013_2.00.raw.xz
  • +
  • tn.2014_1.00.raw.xz
  • +
  • to.2012.00.raw.xz
  • +
  • to.2013_1.00.raw.xz
  • +
  • to.2013_2.00.raw.xz
  • +
  • to.2014_1.00.raw.xz
  • +
  • tr.2012.00.raw.xz
  • +
  • tr.2013_1.00.raw.xz
  • +
  • tr.2013_2.00.raw.xz
  • +
  • tr.2014_1.00.raw.xz
  • +
  • ts.2012.00.raw.xz
  • +
  • ts.2013_1.00.raw.xz
  • +
  • ts.2013_2.00.raw.xz
  • +
  • ts.2014_1.00.raw.xz
  • +
  • tt.2012.00.raw.xz
  • +
  • tt.2013_1.00.raw.xz
  • +
  • tt.2013_2.00.raw.xz
  • +
  • tt.2014_1.00.raw.xz
  • +
  • ug.2012.00.raw.xz
  • +
  • ug.2013_1.00.raw.xz
  • +
  • ug.2013_2.00.raw.xz
  • +
  • ug.2014_1.00.raw.xz
  • +
  • uk.2012.00.raw.xz
  • +
  • uk.2013_1.00.raw.xz
  • +
  • uk.2013_2.00.raw.xz
  • +
  • uk.2014_1.00.raw.xz
  • +
  • un.2012.00.raw.xz
  • +
  • un.2013_1.00.raw.xz
  • +
  • ur.2012.00.raw.xz
  • +
  • ur.2013_1.00.raw.xz
  • +
  • ur.2013_2.00.raw.xz
  • +
  • ur.2014_1.00.raw.xz
  • +
  • uz.2012.00.raw.xz
  • +
  • uz.2013_1.00.raw.xz
  • +
  • uz.2013_2.00.raw.xz
  • +
  • uz.2014_1.00.raw.xz
  • +
  • ve.2012.00.raw.xz
  • +
  • ve.2013_1.00.raw.xz
  • +
  • ve.2013_2.00.raw.xz
  • +
  • ve.2014_1.00.raw.xz
  • +
  • vi.2012.00.raw.xz
  • +
  • vi.2013_1.00.raw.xz
  • +
  • vi.2013_2.00.raw.xz
  • +
  • vi.2014_1.00.raw.xz
  • +
  • vo.2012.00.raw.xz
  • +
  • vo.2013_1.00.raw.xz
  • +
  • vo.2013_2.00.raw.xz
  • +
  • vo.2014_1.00.raw.xz
  • +
  • war.2012.00.raw.xz
  • +
  • war.2013_1.00.raw.xz
  • +
  • war.2013_2.00.raw.xz
  • +
  • war.2014_1.00.raw.xz
  • +
  • wo.2012.00.raw.xz
  • +
  • wo.2013_1.00.raw.xz
  • +
  • wo.2013_2.00.raw.xz
  • +
  • wo.2014_1.00.raw.xz
  • +
  • xh.2012.00.raw.xz
  • +
  • xh.2013_1.00.raw.xz
  • +
  • xh.2013_2.00.raw.xz
  • +
  • xh.2014_1.00.raw.xz
  • +
  • xx-Armi.2012.00.raw.xz
  • +
  • xx-Armi.2013_1.00.raw.xz
  • +
  • xx-Armi.2013_2.00.raw.xz
  • +
  • xx-Armi.2014_1.00.raw.xz
  • +
  • xx-Avst.2012.00.raw.xz
  • +
  • xx-Avst.2013_1.00.raw.xz
  • +
  • xx-Avst.2013_2.00.raw.xz
  • +
  • xx-Avst.2014_1.00.raw.xz
  • +
  • xx-Bali.2012.00.raw.xz
  • +
  • xx-Bali.2013_1.00.raw.xz
  • +
  • xx-Bali.2013_2.00.raw.xz
  • +
  • xx-Bali.2014_1.00.raw.xz
  • +
  • xx-Bamu.2012.00.raw.xz
  • +
  • xx-Bamu.2013_1.00.raw.xz
  • +
  • xx-Bamu.2013_2.00.raw.xz
  • +
  • xx-Bamu.2014_1.00.raw.xz
  • +
  • xx-Batk.2012.00.raw.xz
  • +
  • xx-Batk.2013_1.00.raw.xz
  • +
  • xx-Batk.2013_2.00.raw.xz
  • +
  • xx-Batk.2014_1.00.raw.xz
  • +
  • xx-Bopo.2012.00.raw.xz
  • +
  • xx-Bopo.2013_1.00.raw.xz
  • +
  • xx-Bopo.2013_2.00.raw.xz
  • +
  • xx-Bopo.2014_1.00.raw.xz
  • +
  • xx-Brah.2012.00.raw.xz
  • +
  • xx-Brah.2013_1.00.raw.xz
  • +
  • xx-Brah.2013_2.00.raw.xz
  • +
  • xx-Brah.2014_1.00.raw.xz
  • +
  • xx-Bugi.2012.00.raw.xz
  • +
  • xx-Bugi.2013_1.00.raw.xz
  • +
  • xx-Bugi.2013_2.00.raw.xz
  • +
  • xx-Bugi.2014_1.00.raw.xz
  • +
  • xx-Buhd.2012.00.raw.xz
  • +
  • xx-Buhd.2013_1.00.raw.xz
  • +
  • xx-Buhd.2013_2.00.raw.xz
  • +
  • xx-Buhd.2014_1.00.raw.xz
  • +
  • xx-Cakm.2012.00.raw.xz
  • +
  • xx-Cakm.2013_1.00.raw.xz
  • +
  • xx-Cakm.2013_2.00.raw.xz
  • +
  • xx-Cakm.2014_1.00.raw.xz
  • +
  • xx-Cari.2012.00.raw.xz
  • +
  • xx-Cari.2013_1.00.raw.xz
  • +
  • xx-Cari.2013_2.00.raw.xz
  • +
  • xx-Cari.2014_1.00.raw.xz
  • +
  • xx-Cham.2012.00.raw.xz
  • +
  • xx-Cham.2013_1.00.raw.xz
  • +
  • xx-Cham.2013_2.00.raw.xz
  • +
  • xx-Cham.2014_1.00.raw.xz
  • +
  • xx-Copt.2012.00.raw.xz
  • +
  • xx-Copt.2013_1.00.raw.xz
  • +
  • xx-Copt.2013_2.00.raw.xz
  • +
  • xx-Copt.2014_1.00.raw.xz
  • +
  • xx-Cprt.2012.00.raw.xz
  • +
  • xx-Cprt.2013_1.00.raw.xz
  • +
  • xx-Cprt.2013_2.00.raw.xz
  • +
  • xx-Cprt.2014_1.00.raw.xz
  • +
  • xx-Dsrt.2012.00.raw.xz
  • +
  • xx-Dsrt.2013_1.00.raw.xz
  • +
  • xx-Dsrt.2013_2.00.raw.xz
  • +
  • xx-Dsrt.2014_1.00.raw.xz
  • +
  • xx-Egyp.2012.00.raw.xz
  • +
  • xx-Egyp.2013_1.00.raw.xz
  • +
  • xx-Egyp.2013_2.00.raw.xz
  • +
  • xx-Egyp.2014_1.00.raw.xz
  • +
  • xx-Glag.2012.00.raw.xz
  • +
  • xx-Glag.2013_1.00.raw.xz
  • +
  • xx-Glag.2013_2.00.raw.xz
  • +
  • xx-Glag.2014_1.00.raw.xz
  • +
  • xx-Goth.2012.00.raw.xz
  • +
  • xx-Goth.2013_1.00.raw.xz
  • +
  • xx-Goth.2013_2.00.raw.xz
  • +
  • xx-Goth.2014_1.00.raw.xz
  • +
  • xx-Hano.2012.00.raw.xz
  • +
  • xx-Hano.2013_1.00.raw.xz
  • +
  • xx-Hano.2013_2.00.raw.xz
  • +
  • xx-Hano.2014_1.00.raw.xz
  • +
  • xx-Ital.2012.00.raw.xz
  • +
  • xx-Ital.2013_1.00.raw.xz
  • +
  • xx-Ital.2013_2.00.raw.xz
  • +
  • xx-Ital.2014_1.00.raw.xz
  • +
  • xx-Java.2012.00.raw.xz
  • +
  • xx-Java.2013_1.00.raw.xz
  • +
  • xx-Java.2013_2.00.raw.xz
  • +
  • xx-Java.2014_1.00.raw.xz
  • +
  • xx-Kali.2012.00.raw.xz
  • +
  • xx-Kali.2013_1.00.raw.xz
  • +
  • xx-Kali.2013_2.00.raw.xz
  • +
  • xx-Kali.2014_1.00.raw.xz
  • +
  • xx-Khar.2012.00.raw.xz
  • +
  • xx-Khar.2013_1.00.raw.xz
  • +
  • xx-Khar.2013_2.00.raw.xz
  • +
  • xx-Khar.2014_1.00.raw.xz
  • +
  • xx-Kthi.2012.00.raw.xz
  • +
  • xx-Kthi.2013_1.00.raw.xz
  • +
  • xx-Kthi.2013_2.00.raw.xz
  • +
  • xx-Kthi.2014_1.00.raw.xz
  • +
  • xx-Lana.2012.00.raw.xz
  • +
  • xx-Lana.2013_1.00.raw.xz
  • +
  • xx-Lana.2013_2.00.raw.xz
  • +
  • xx-Lana.2014_1.00.raw.xz
  • +
  • xx-Lepc.2012.00.raw.xz
  • +
  • xx-Lepc.2013_1.00.raw.xz
  • +
  • xx-Lepc.2013_2.00.raw.xz
  • +
  • xx-Lepc.2014_1.00.raw.xz
  • +
  • xx-Linb.2012.00.raw.xz
  • +
  • xx-Linb.2013_1.00.raw.xz
  • +
  • xx-Linb.2013_2.00.raw.xz
  • +
  • xx-Linb.2014_1.00.raw.xz
  • +
  • xx-Lisu.2012.00.raw.xz
  • +
  • xx-Lisu.2013_1.00.raw.xz
  • +
  • xx-Lisu.2013_2.00.raw.xz
  • +
  • xx-Lisu.2014_1.00.raw.xz
  • +
  • xx-Lyci.2012.00.raw.xz
  • +
  • xx-Lyci.2013_1.00.raw.xz
  • +
  • xx-Lyci.2013_2.00.raw.xz
  • +
  • xx-Lyci.2014_1.00.raw.xz
  • +
  • xx-Lydi.2012.00.raw.xz
  • +
  • xx-Lydi.2013_1.00.raw.xz
  • +
  • xx-Lydi.2013_2.00.raw.xz
  • +
  • xx-Lydi.2014_1.00.raw.xz
  • +
  • xx-Mand.2012.00.raw.xz
  • +
  • xx-Mand.2013_1.00.raw.xz
  • +
  • xx-Mand.2013_2.00.raw.xz
  • +
  • xx-Mand.2014_1.00.raw.xz
  • +
  • xx-Merc.2012.00.raw.xz
  • +
  • xx-Merc.2013_1.00.raw.xz
  • +
  • xx-Merc.2013_2.00.raw.xz
  • +
  • xx-Merc.2014_1.00.raw.xz
  • +
  • xx-Mero.2012.00.raw.xz
  • +
  • xx-Mero.2013_1.00.raw.xz
  • +
  • xx-Mero.2013_2.00.raw.xz
  • +
  • xx-Mero.2014_1.00.raw.xz
  • +
  • xx-Mtei.2012.00.raw.xz
  • +
  • xx-Mtei.2013_1.00.raw.xz
  • +
  • xx-Mtei.2013_2.00.raw.xz
  • +
  • xx-Mtei.2014_1.00.raw.xz
  • +
  • xx-Nkoo.2012.00.raw.xz
  • +
  • xx-Nkoo.2013_1.00.raw.xz
  • +
  • xx-Nkoo.2013_2.00.raw.xz
  • +
  • xx-Nkoo.2014_1.00.raw.xz
  • +
  • xx-Ogam.2012.00.raw.xz
  • +
  • xx-Ogam.2013_1.00.raw.xz
  • +
  • xx-Ogam.2013_2.00.raw.xz
  • +
  • xx-Ogam.2014_1.00.raw.xz
  • +
  • xx-Olck.2012.00.raw.xz
  • +
  • xx-Olck.2013_1.00.raw.xz
  • +
  • xx-Olck.2013_2.00.raw.xz
  • +
  • xx-Olck.2014_1.00.raw.xz
  • +
  • xx-Orkh.2012.00.raw.xz
  • +
  • xx-Orkh.2013_1.00.raw.xz
  • +
  • xx-Orkh.2013_2.00.raw.xz
  • +
  • xx-Orkh.2014_1.00.raw.xz
  • +
  • xx-Osma.2012.00.raw.xz
  • +
  • xx-Osma.2013_1.00.raw.xz
  • +
  • xx-Osma.2013_2.00.raw.xz
  • +
  • xx-Osma.2014_1.00.raw.xz
  • +
  • xx-Phag.2012.00.raw.xz
  • +
  • xx-Phag.2013_1.00.raw.xz
  • +
  • xx-Phag.2013_2.00.raw.xz
  • +
  • xx-Phag.2014_1.00.raw.xz
  • +
  • xx-Phli.2012.00.raw.xz
  • +
  • xx-Phli.2013_1.00.raw.xz
  • +
  • xx-Phli.2013_2.00.raw.xz
  • +
  • xx-Phli.2014_1.00.raw.xz
  • +
  • xx-Phnx.2012.00.raw.xz
  • +
  • xx-Phnx.2013_1.00.raw.xz
  • +
  • xx-Phnx.2013_2.00.raw.xz
  • +
  • xx-Phnx.2014_1.00.raw.xz
  • +
  • xx-Plrd.2012.00.raw.xz
  • +
  • xx-Plrd.2013_1.00.raw.xz
  • +
  • xx-Plrd.2013_2.00.raw.xz
  • +
  • xx-Plrd.2014_1.00.raw.xz
  • +
  • xx-Prti.2012.00.raw.xz
  • +
  • xx-Prti.2013_1.00.raw.xz
  • +
  • xx-Prti.2013_2.00.raw.xz
  • +
  • xx-Prti.2014_1.00.raw.xz
  • +
  • xx-Qaai.2012.00.raw.xz
  • +
  • xx-Qaai.2013_1.00.raw.xz
  • +
  • xx-Qaai.2013_2.00.raw.xz
  • +
  • xx-Qaai.2014_1.00.raw.xz
  • +
  • xx-Rjng.2012.00.raw.xz
  • +
  • xx-Rjng.2013_1.00.raw.xz
  • +
  • xx-Rjng.2013_2.00.raw.xz
  • +
  • xx-Rjng.2014_1.00.raw.xz
  • +
  • xx-Runr.2012.00.raw.xz
  • +
  • xx-Runr.2013_1.00.raw.xz
  • +
  • xx-Runr.2013_2.00.raw.xz
  • +
  • xx-Runr.2014_1.00.raw.xz
  • +
  • xx-Samr.2012.00.raw.xz
  • +
  • xx-Samr.2013_1.00.raw.xz
  • +
  • xx-Samr.2013_2.00.raw.xz
  • +
  • xx-Samr.2014_1.00.raw.xz
  • +
  • xx-Sarb.2012.00.raw.xz
  • +
  • xx-Sarb.2013_1.00.raw.xz
  • +
  • xx-Sarb.2013_2.00.raw.xz
  • +
  • xx-Sarb.2014_1.00.raw.xz
  • +
  • xx-Saur.2012.00.raw.xz
  • +
  • xx-Saur.2013_1.00.raw.xz
  • +
  • xx-Saur.2013_2.00.raw.xz
  • +
  • xx-Saur.2014_1.00.raw.xz
  • +
  • xx-Shaw.2012.00.raw.xz
  • +
  • xx-Shaw.2013_1.00.raw.xz
  • +
  • xx-Shaw.2013_2.00.raw.xz
  • +
  • xx-Shaw.2014_1.00.raw.xz
  • +
  • xx-Shrd.2012.00.raw.xz
  • +
  • xx-Shrd.2013_1.00.raw.xz
  • +
  • xx-Shrd.2013_2.00.raw.xz
  • +
  • xx-Shrd.2014_1.00.raw.xz
  • +
  • xx-Sora.2012.00.raw.xz
  • +
  • xx-Sora.2013_1.00.raw.xz
  • +
  • xx-Sora.2013_2.00.raw.xz
  • +
  • xx-Sora.2014_1.00.raw.xz
  • +
  • xx-Sund.2012.00.raw.xz
  • +
  • xx-Sund.2013_1.00.raw.xz
  • +
  • xx-Sund.2013_2.00.raw.xz
  • +
  • xx-Sund.2014_1.00.raw.xz
  • +
  • xx-Sylo.2012.00.raw.xz
  • +
  • xx-Sylo.2013_1.00.raw.xz
  • +
  • xx-Sylo.2013_2.00.raw.xz
  • +
  • xx-Sylo.2014_1.00.raw.xz
  • +
  • xx-Tagb.2012.00.raw.xz
  • +
  • xx-Tagb.2013_1.00.raw.xz
  • +
  • xx-Tagb.2013_2.00.raw.xz
  • +
  • xx-Tagb.2014_1.00.raw.xz
  • +
  • xx-Takr.2012.00.raw.xz
  • +
  • xx-Takr.2013_1.00.raw.xz
  • +
  • xx-Takr.2013_2.00.raw.xz
  • +
  • xx-Takr.2014_1.00.raw.xz
  • +
  • xx-Tale.2012.00.raw.xz
  • +
  • xx-Tale.2013_1.00.raw.xz
  • +
  • xx-Tale.2013_2.00.raw.xz
  • +
  • xx-Tale.2014_1.00.raw.xz
  • +
  • xx-Talu.2012.00.raw.xz
  • +
  • xx-Talu.2013_1.00.raw.xz
  • +
  • xx-Talu.2013_2.00.raw.xz
  • +
  • xx-Talu.2014_1.00.raw.xz
  • +
  • xx-Tavt.2012.00.raw.xz
  • +
  • xx-Tavt.2013_1.00.raw.xz
  • +
  • xx-Tavt.2013_2.00.raw.xz
  • +
  • xx-Tavt.2014_1.00.raw.xz
  • +
  • xx-Tfng.2012.00.raw.xz
  • +
  • xx-Tfng.2013_1.00.raw.xz
  • +
  • xx-Tfng.2013_2.00.raw.xz
  • +
  • xx-Tfng.2014_1.00.raw.xz
  • +
  • xx-Ugar.2012.00.raw.xz
  • +
  • xx-Ugar.2013_1.00.raw.xz
  • +
  • xx-Ugar.2013_2.00.raw.xz
  • +
  • xx-Ugar.2014_1.00.raw.xz
  • +
  • xx-Vaii.2012.00.raw.xz
  • +
  • xx-Vaii.2013_1.00.raw.xz
  • +
  • xx-Vaii.2013_2.00.raw.xz
  • +
  • xx-Vaii.2014_1.00.raw.xz
  • +
  • xx-Xpeo.2012.00.raw.xz
  • +
  • xx-Xpeo.2013_1.00.raw.xz
  • +
  • xx-Xpeo.2013_2.00.raw.xz
  • +
  • xx-Xpeo.2014_1.00.raw.xz
  • +
  • xx-Xsux.2012.00.raw.xz
  • +
  • xx-Xsux.2013_1.00.raw.xz
  • +
  • xx-Xsux.2013_2.00.raw.xz
  • +
  • xx-Xsux.2014_1.00.raw.xz
  • +
  • xx-Yiii.2012.00.raw.xz
  • +
  • xx-Yiii.2013_1.00.raw.xz
  • +
  • xx-Yiii.2013_2.00.raw.xz
  • +
  • xx-Yiii.2014_1.00.raw.xz
  • +
  • yi.2012.00.raw.xz
  • +
  • yi.2013_1.00.raw.xz
  • +
  • yi.2013_2.00.raw.xz
  • +
  • yi.2014_1.00.raw.xz
  • +
  • yo.2012.00.raw.xz
  • +
  • yo.2013_1.00.raw.xz
  • +
  • yo.2013_2.00.raw.xz
  • +
  • yo.2014_1.00.raw.xz
  • +
  • za.2012.00.raw.xz
  • +
  • za.2013_1.00.raw.xz
  • +
  • za.2013_2.00.raw.xz
  • +
  • za.2014_1.00.raw.xz
  • +
  • zh.2012.00.raw.xz
  • +
  • zh.2013_1.00.raw.xz
  • +
  • zh.2013_2.00.raw.xz
  • +
  • zh.2014_1.00.raw.xz
  • +
  • zh-Hant.2012.00.raw.xz
  • +
  • zh-Hant.2013_1.00.raw.xz
  • +
  • zh-Hant.2013_2.00.raw.xz
  • +
  • zh-Hant.2014_1.00.raw.xz
  • +
  • zu.2012.00.raw.xz
  • +
  • zu.2013_1.00.raw.xz
  • +
  • zu.2013_2.00.raw.xz
  • +
  • zu.2014_1.00.raw.xz
  • +
  • zzp.2012.00.raw.xz
  • +
  • zzp.2013_1.00.raw.xz
  • +
  • zzp.2013_2.00.raw.xz
  • +
  • zzp.2014_1.00.raw.xz
  • diff --git a/s3/make_non_en_public.sh b/s3/make_non_en_public.sh new file mode 100755 index 0000000..8956688 --- /dev/null +++ b/s3/make_non_en_public.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +RAW_DIR="/fs/vali0/www/data.statmt.org/ngrams/raw" +LANGS=$(ls ${RAW_DIR}/*.xz | awk 'BEGIN {FS="/"} {print $(NF)}' | cut -d '.' -f 1 | uniq) + +for lang in $LANGS; do + s3cmd setacl --acl-public -r "s3://web-language-models/ngrams/${lang}/" +done diff --git a/s3/rename_raw.sh b/s3/rename_raw.sh new file mode 100755 index 0000000..22fd6e0 --- /dev/null +++ b/s3/rename_raw.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +LANGUAGES="$1" +DELETIONS="$2" +DELETION_ERRORS="$3" +RENAMINGS="$4" +RENAMING_ERRORS="$5" + +# TODO: Dry test run. +# TODO: Test move and delete commands. + +for LANG in $(cat $LANGUAGES); do + S3_RAW_BUCKET="s3://web-language-models/ngrams/${LANG}/raw" + + S3_2014_1="${S3_RAW_BUCKET}/${LANG}.2014_1.00.raw.xz" + s3cmd info "${S3_2014_1}" > /dev/null 2>&1 + if [[ $? -eq 0 ]]; then + #s3cmd del "${S3_2014_1}" + if [[ $? -eq 0 ]]; then + echo "${S3_2014_1}" >> ${DELETIONS} + else + echo "${S3_2014_1}" >> ${DELETION_ERRORS} + fi + else + echo "${S3_2014_1}" >> ${DELETION_ERRORS} + fi + + for ID in 2012 2013_1 2013_2; do + S3_NAME="${S3_RAW_BUCKET}/${LANG}.${ID}.00.raw.xz" + S3_NEW_NAME="${S3_RAW_BUCKET}/${LANG}.${ID}.raw.xz" + s3cmd ls "${S3_NAME}" > /dev/null 2>&1 + if [[ $? -eq 0 ]]; then + #s3cmd mv "${S3_NAME}" "${S3_NEW_NAME}" + if [[ $? 
-eq 0 ]]; then + echo "${S3_NAME} ${S3_NEW_NAME}" >> ${RENAMINGS} + else + echo "${S3_NAME}" >> ${RENAMING_ERRORS} + fi + else + echo "${S3_NAME}" >> ${RENAMING_ERRORS} + fi + done +done diff --git a/s3/s3_copy.py b/s3/s3_copy.py new file mode 100755 index 0000000..6a77979 --- /dev/null +++ b/s3/s3_copy.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +# ngrams//fs/nas/eikthyrnir0/tim/cc/deduped/all_years/de/deduped/de.deduped.xz + +import argparse +import boto3 +from boto3.s3.transfer import TransferConfig + +parser = argparse.ArgumentParser() +parser.add_argument('-chunksize', type=int, default=1000, help='size of each part in MB') +parser.add_argument('-sourcekey', help='source object location') +parser.add_argument('-targetkey', help='location to copy to') +args = parser.parse_args() + +# Convert chunksize from MB to bytes +chunksize = args.chunksize * 1000000 +transferConfig = TransferConfig(multipart_threshold=chunksize, multipart_chunksize=chunksize) + +s3 = boto3.resource('s3') +copy_source = { + 'Bucket': 'web-language-models', + 'Key': args.sourcekey +} +s3.meta.client.copy(copy_source, 'web-language-models', args.targetkey, Config=transferConfig) diff --git a/s3/upload_deduped.sh b/s3/upload_deduped.sh new file mode 100755 index 0000000..9cd64fd --- /dev/null +++ b/s3/upload_deduped.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -e + +FILE="$1" +LANG=$(basename $FILE | cut -d . -f 1) + +/home/tim/bin/s3cmd/s3cmd del "s3://web-language-models/ngrams/${LANG}/deduped/${LANG}.deduped.xz" +/home/tim/bin/s3cmd/s3cmd put --multipart-chunk-size-mb=1000 "${FILE}" "s3://web-language-models/ngrams/${LANG}/deduped/" diff --git a/s3/upload_raw.sh b/s3/upload_raw.sh new file mode 100755 index 0000000..dd2ccec --- /dev/null +++ b/s3/upload_raw.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +FILE=$(basename $1 | sed 's/2013_2/2013_48/g' | sed 's/2013_1/2013_20/g') +LANGUAGE=$(echo ${FILE} | cut -d . 
-f 1) + +BUCKET="s3://web-language-models/ngrams/${LANGUAGE}/raw/${FILE}" + +/fs/zisa0/tim/dev/s3cmd/s3cmd put -q --continue-put --multipart-chunk-size-mb=1000 $1 ${BUCKET}