diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c92d00f --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +jhu/ diff --git a/LOCATIONS.md b/LOCATIONS.md new file mode 100644 index 0000000..39a8bf9 --- /dev/null +++ b/LOCATIONS.md @@ -0,0 +1,46 @@ +# Locations of data + +## /fs/zisa0/commoncrawl + +- 2015_27 raw non-english +- 2016_30 raw non-english +- 2017_17 experiments with extracting parallel text + +## /fs/freyja0/commoncrawl + +- 2015_06 raw non-english +- 2015_27 langsplit files +- 2015_30 langsplit files +- 2017_17 langsplit files + +## /fs/mimir0/commoncrawl + +- 2015_06 english raw +- 2015_11, 2015_14, 2015_18, 2015_22, 2015_27, 2015_27, 2015_32, 2015_35, 2015_40, 2015_48, 2016_50, 2017_17 all raw + +## /fs/nas/tim/cc + +- 2015_11, 2015_14 english raw +- deduped files for ar, cs, de, es, fr, it, pl, ru + +## /fs/nas/heithrun0/commoncrawl/langsplit + +- langsplit files for all crawls from 2013_20 up to 2015_48 and for 2016_50 +- some scripts and files from Christian which seem to be related to the parallel corpus extraction + +## /fs/vili0/buck/cc/langsplit2/raw + +- non-english raw files for all 2014 crawls + +## /fs/vili0/buck/cc/langsplit2 and /fs/vili0/buck/cc/langsplit + +- temporary data between the langsplit files and the raw files for 2014 and 2015 crawls, potential candidate for deletion + +## /fs/vili0/www/data.statmt.org/ngrams + +- home directory of the "data.statmt.org/ngrams" website, contains symbolic links to old raw data + +## /fs/gna0/buck/cc/db + +- contains RocksDB Index data for all crawls from 2012 to 2015_40 + 2016_50; used in the parallel corpus extraction pipeline + diff --git a/TODO b/TODO new file mode 100644 index 0000000..1b98c39 --- /dev/null +++ b/TODO @@ -0,0 +1,3 @@ +- Create deduped files for all minor languages that are not present yet +- Update english deduped files +- Copy the English language trie on AWS S3 diff --git a/deduped/README.md b/deduped/README.md new file mode 100644 index 
0000000..45d7746 --- /dev/null +++ b/deduped/README.md @@ -0,0 +1,10 @@ +# Deduping .raw files + +## Dedupe + + +## Shard and dedupe + +If all of the raw data of one language is too big to fit into memory, we have to shard the raw data into multiple files. This is usually done with English. +Before the sharding we do some minor processing of the raw data which removes lines with the document delimiter hash (df6fa1abb58549287111ba8d776733e9), +strips leading and trailing white space and removes lines with invalid UTF-8. diff --git a/deduped/compress_shard.sh b/deduped/compress_shard.sh new file mode 100755 index 0000000..9d045db --- /dev/null +++ b/deduped/compress_shard.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e +set -o pipefail + +ID="$1" +SHARD_DIR="$2" +OUT_DIR="$3" + +INPUT_FILE="${SHARD_DIR}/en.tmp${ID}" +OUTPUT_FILE="${OUT_DIR}/en.tmp${ID}.gz" +DONEFILE="${OUTPUT_FILE}.done" + +if [[ -f "${DONEFILE}" ]]; then + exit 0 +fi + +< "${INPUT_FILE}" gzip -c > "${OUTPUT_FILE}" + +touch "${DONEFILE}" diff --git a/deduped/deduped_from_shard.sh b/deduped/deduped_from_shard.sh new file mode 100755 index 0000000..ee82961 --- /dev/null +++ b/deduped/deduped_from_shard.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e +set -o pipefail + +# Shard id without zero padding (it is zero-padded below for the output name). 
+ID="$1" +SHARD_DIR="$2" +PREVIOUS_DEDUPED_DIR="$3" +OUT_DIR="$4" + +PREPROCESS_DIR="/fs/zisa0/tim/dev/preprocess/bin" + +PADDED_ID=$(printf "%02d" ${ID}) +INPUT_FILE="${SHARD_DIR}/en.tmp${ID}" +OUTPUT_FILE="${OUT_DIR}/en.${PADDED_ID}.deduped.xz" +DONEFILE="${OUTPUT_FILE}.done" + +PREVIOUS_DEDUPED_FILE="${PREVIOUS_DEDUPED_DIR}/en.${PADDED_ID}.deduped.xz" + +if [[ -f "${DONEFILE}" ]]; then + exit 0 +fi + + +<"${INPUT_FILE}" ${PREPROCESS_DIR}/commoncrawl_dedupe ${PREVIOUS_DEDUPED_FILE} | xz > "${OUTPUT_FILE}" + +rm "${INPUT_FILE}" + +touch "${DONEFILE}" diff --git a/deduped/shard_fifo.sh b/deduped/shard_fifo.sh new file mode 100755 index 0000000..4f7a8d8 --- /dev/null +++ b/deduped/shard_fifo.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -e +set -o pipefail + +RAW_DIR="$1" +TMP_DIR="$2" + +PREPROCESS_DIR="/fs/zisa0/tim/dev/preprocess/bin" +RAW_FILES="${RAW_DIR}/*.raw.xz" + +TMP_PREFIX="en.tmp" +SHARD_COUNT=100 + +# Create named pipes +for i in $(seq 0 $((SHARD_COUNT-1))); do + mkfifo "${TMP_DIR}/${TMP_PREFIX}${i}" +done + +# Clean raw files and shard them into pipes +/fs/zisa0/tim/bin/xz -T10 -cd ${RAW_FILES} | \ + ${PREPROCESS_DIR}/commoncrawl_clean | \ + ${PREPROCESS_DIR}/shard_fifo ${TMP_DIR}/${TMP_PREFIX} ${SHARD_COUNT} diff --git a/download/README.md b/download/README.md new file mode 100644 index 0000000..99b4b56 --- /dev/null +++ b/download/README.md @@ -0,0 +1,3 @@ +# Download CommonCrawl data + +Scripts for the monolingual pipeline as described [here](https://github.com/ModernMT/DataCollection/blob/master/metadata/metadata.md). `setup.sh` creates all necessary directories and downloads all target urls. `download_wet.sh` does the actual download. `count_downloads.sh` counts how many of the files are already downloaded. 
diff --git a/download/count_downloads.sh b/download/count_downloads.sh new file mode 100755 index 0000000..ca222c2 --- /dev/null +++ b/download/count_downloads.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e +set -o pipefail + +total=0 +downloaded=0 +echo "$total"; echo -en "\e[1A" +for path in `cat $1`; do + echo -e "\e[0K\r $total"; echo -en "\e[1A" + total=$((total+1)) + FILENAME=$(echo $path | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF)}') + if [ -f ${FILENAME}.done ]; then + downloaded=$((downloaded+1)) + fi +done + +echo "$downloaded/$total" +echo "Downloaded/Total" diff --git a/download/download_wet.sh b/download/download_wet.sh new file mode 100755 index 0000000..73c1f43 --- /dev/null +++ b/download/download_wet.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e +set -o pipefail + +FILENAME=$(echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF)}') + +if [ ! -f ${FILENAME}.done ]; then + curl -s $1 | gzip -cd | \ + /fs/nas/heithrun0/commoncrawl/langsplit/bin/read_wet.py | \ + /fs/nas/heithrun0/commoncrawl/langsplit/bin/langsplit --printchunks 2> /dev/null | \ + xz -9 -e -T 2 > ${FILENAME}.langsplit.xz + touch ${FILENAME}.done +fi diff --git a/download/setup.sh b/download/setup.sh new file mode 100755 index 0000000..5541e3d --- /dev/null +++ b/download/setup.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -e +set -o pipefail + + +YEAR=$(echo $1 | awk ' BEGIN { FS = "_" } { print $1 }') +WEEK=$(echo $1 | awk ' BEGIN { FS = "_" } { print $2 }') + +# Make directory for specified crawl +mkdir -p ${1}/wet +cd ${1}/wet + +# Download path file +wget https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-${YEAR}-${WEEK}/wet.paths.gz + +# Convert to HTTPS URLs +gzip -cd wet.paths.gz | sed 's/^/https:\/\/commoncrawl.s3.amazonaws.com\//' > wet.paths.http + +# Make subdirectories +for f in `gzip -cd wet.paths.gz | cut -d '/' -f 4 | sort | uniq`; do mkdir -p $f; done; diff --git a/raw/README.md b/raw/README.md new file mode 100644 index 0000000..54e2f4e --- /dev/null +++ 
b/raw/README.md @@ -0,0 +1,25 @@ +# Creating .raw files + +## High-level description + +This pipeline takes the `*.langsplit.xz` files as input. Note that each crawl from CommonCrawl is usually split into 100 different shards. +However, this number is not necessarily consistent among all crawls (e.g. sometimes it might be 98). Each of those 100 different shards is in turn split into +several hundred files. For each of these files we have one `.langsplit.xz` file. + +The script `collect_monolingual.sh` takes as input the directory name of one shard, reads all the `.langsplit.xz` files in that directory and splits them +according to language. The second argument of this script is the output directory. For each language `collect_monolingual.sh` writes a file with the name +`text.${language}.gz` to the output directory. + +Now since `collect_monolingual.sh` is called on each of the 100 shards separately we still have to concatenate all the different `text.${language}.gz` files +into one big `${language}.raw.xz` file. This is done with the `create_raw.sh` script. There is a separate `create_raw_en.sh` since we want to create 100 raw files +for English because a single raw file for English would be too large. 
+ +## Running the pipeline + +```bash +ls * | parallel ./collect_monolingual.sh {} {} +``` + +```bash +cat language.codes | parallel ./create_raw.sh $crawl_dir $out_dir {} +``` diff --git a/raw/collect_langs.py b/raw/collect_langs.py new file mode 100755 index 0000000..513dabc --- /dev/null +++ b/raw/collect_langs.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import sys +import argparse + +magic_number = 'df6fa1abb58549287111ba8d776733e9' + +cld2_langcodes = ['en', 'da', 'nl', 'fi', 'fr', 'de', 'iw', 'it', + 'ja', 'ko', 'no', 'pl', 'pt', 'ru', 'es', 'sv', + 'zh', 'cs', 'el', 'is', 'lv', 'lt', 'ro', 'hu', + 'et', 'xxx', 'un', 'bg', 'hr', 'sr', 'ga', 'gl', + 'tl', 'tr', 'uk', 'hi', 'mk', 'bn', 'id', 'la', + 'ms', 'ml', 'cy', 'ne', 'te', 'sq', 'ta', 'be', + 'jw', 'oc', 'ur', 'bh', 'gu', 'th', 'ar', 'ca', + 'eo', 'eu', 'ia', 'kn', 'pa', 'gd', 'sw', 'sl', + 'mr', 'mt', 'vi', 'fy', 'sk', 'zh-Hant', 'fo', + 'su', 'uz', 'am', 'az', 'ka', 'ti', 'fa', 'bs', + 'si', 'nn', 'xh', 'zu', 'gn', 'st', 'tk', 'ky', + 'br', 'tw', 'yi', 'so', 'ug', 'ku', 'mn', 'hy', + 'lo', 'sd', 'rm', 'af', 'lb', 'my', 'km', 'bo', + 'dv', 'chr', 'syr', 'lif', 'or', 'as', 'co', + 'ie', 'kk', 'ln', 'mi', 'wo', 'ab', 'aa', 'ay', + 'ba', 'bi', 'dz', 'fj', 'kl', 'ha', 'ht', 'ik', + 'iu', 'ks', 'rw', 'mg', 'na', 'om', 'rn', 'sm', + 'sg', 'sa', 'ss', 'ts', 'tn', 'vo', 'za', 'kha', + 'sco', 'lg', 'gv', 'sr-ME', 'ak', 'ig', 'mfe', + 'haw', 'ceb', 'ee', 'gaa', 'blu', 'kri', 'loz', + 'lua', 'luo', 'new', 'ny', 'os', 'pam', 'nso', + 'raj', 'crs', 'tum', 've', 'war', 'nr', 'zzb', + 'zzp', 'zzh', 'tlh', 'zze', 'xx-Zyyy', 'xx-Latn', + 'xx-Grek', 'xx-Cyrl', 'xx-Armn', 'xx-Hebr', + 'xx-Arab', 'xx-Syrc', 'xx-Thaa', 'xx-Deva', + 'xx-Beng', 'xx-Guru', 'xx-Gujr', 'xx-Orya', + 'xx-Taml', 'xx-Telu', 'xx-Knda', 'xx-Mlym', + 'xx-Sinh', 'xx-Thai', 'xx-Laoo', 'xx-Tibt', + 'xx-Mymr', 'xx-Geor', 'xx-Hang', 'xx-Ethi', + 'xx-Cher', 'xx-Cans', 'xx-Ogam', 'xx-Runr', + 'xx-Khmr', 'xx-Mong', 'xx-Hira', 'xx-Kana', + 'xx-Bopo', 
'xx-Hani', 'xx-Yiii', 'xx-Ital', + 'xx-Goth', 'xx-Dsrt', 'xx-Qaai', 'xx-Tglg', + 'xx-Hano', 'xx-Buhd', 'xx-Tagb', 'xx-Limb', + 'xx-Tale', 'xx-Linb', 'xx-Ugar', 'xx-Shaw', + 'xx-Osma', 'xx-Cprt', 'xx-Brai', 'xx-Bugi', + 'xx-Copt', 'xx-Talu', 'xx-Glag', 'xx-Tfng', + 'xx-Sylo', 'xx-Xpeo', 'xx-Khar', 'xx-Bali', + 'xx-Xsux', 'xx-Phnx', 'xx-Phag', 'xx-Nkoo', + 'xx-Sund', 'xx-Lepc', 'xx-Olck', 'xx-Vaii', + 'xx-Saur', 'xx-Kali', 'xx-Rjng', 'xx-Lyci', + 'xx-Cari', 'xx-Lydi', 'xx-Cham', 'xx-Lana', + 'xx-Tavt', 'xx-Avst', 'xx-Egyp', 'xx-Samr', + 'xx-Lisu', 'xx-Bamu', 'xx-Java', 'xx-Mtei', + 'xx-Armi', 'xx-Sarb', 'xx-Prti', 'xx-Phli', + 'xx-Orkh', 'xx-Kthi', 'xx-Batk', 'xx-Brah', + 'xx-Mand', 'xx-Cakm', 'xx-Merc', 'xx-Mero', + 'xx-Plrd', 'xx-Shrd', 'xx-Sora', 'xx-Takr'] +cld2_langcodes = [lc.replace('-', '_') for lc in cld2_langcodes] + +parser = argparse.ArgumentParser() +for lc in cld2_langcodes: + parser.add_argument("-%s" % lc, + help="outfile for %s data" % lc, + type=argparse.FileType('wb')) +args = parser.parse_args() + +lang2file = {} +for lc in cld2_langcodes: + if getattr(args, lc) is not None: + lang2file[lc] = getattr(args, lc) + + +buf = [] +current_lang = None + +for line in sys.stdin: + if line.startswith(magic_number): + if buf: + assert current_lang is not None + lang2file[current_lang].write("".join(buf)) + + current_lang = None + buf = [] + + for kv in line.strip().split(): + if kv.startswith("language:"): + lang = kv.split(':', 1)[1] + if lang in lang2file: + current_lang = lang + + if current_lang: + buf.append(line) + +if buf: + assert current_lang is not None + lang2file[current_lang].write("".join(buf)) + +for _, lang_file in lang2file.iteritems(): + lang_file.flush() + lang_file.close() diff --git a/raw/collect_monolingual.sh b/raw/collect_monolingual.sh new file mode 100755 index 0000000..b69d3aa --- /dev/null +++ b/raw/collect_monolingual.sh @@ -0,0 +1,295 @@ +#!/bin/bash + +# Exit as soon as any command fails +set -e +set -o pipefail + 
+BINDIR=/fs/freyja0/commoncrawl + +DATADIR=$1 +OUTDIR=$2 + +mkdir -p ${OUTDIR} + +DONEFILE=${OUTDIR}/langsplit.done + +if [ ! -f ${DONEFILE} ]; then + xzcat ${DATADIR}/*.langsplit.xz | ${BINDIR}/collect_langs.py \ + -en >(pigz -9 >${OUTDIR}/text.en.gz) \ + -da >(pigz -9 >${OUTDIR}/text.da.gz) \ + -nl >(pigz -9 >${OUTDIR}/text.nl.gz) \ + -fi >(pigz -9 >${OUTDIR}/text.fi.gz) \ + -fr >(pigz -9 >${OUTDIR}/text.fr.gz) \ + -de >(pigz -9 >${OUTDIR}/text.de.gz) \ + -iw >(pigz -9 >${OUTDIR}/text.iw.gz) \ + -it >(pigz -9 >${OUTDIR}/text.it.gz) \ + -ja >(pigz -9 >${OUTDIR}/text.ja.gz) \ + -ko >(pigz -9 >${OUTDIR}/text.ko.gz) \ + -no >(pigz -9 >${OUTDIR}/text.no.gz) \ + -pl >(pigz -9 >${OUTDIR}/text.pl.gz) \ + -pt >(pigz -9 >${OUTDIR}/text.pt.gz) \ + -ru >(pigz -9 >${OUTDIR}/text.ru.gz) \ + -es >(pigz -9 >${OUTDIR}/text.es.gz) \ + -sv >(pigz -9 >${OUTDIR}/text.sv.gz) \ + -zh >(pigz -9 >${OUTDIR}/text.zh.gz) \ + -cs >(pigz -9 >${OUTDIR}/text.cs.gz) \ + -el >(pigz -9 >${OUTDIR}/text.el.gz) \ + -is >(pigz -9 >${OUTDIR}/text.is.gz) \ + -lv >(pigz -9 >${OUTDIR}/text.lv.gz) \ + -lt >(pigz -9 >${OUTDIR}/text.lt.gz) \ + -ro >(pigz -9 >${OUTDIR}/text.ro.gz) \ + -hu >(pigz -9 >${OUTDIR}/text.hu.gz) \ + -et >(pigz -9 >${OUTDIR}/text.et.gz) \ + -xxx >(pigz -9 >${OUTDIR}/text.xxx.gz) \ + -un >(pigz -9 >${OUTDIR}/text.un.gz) \ + -bg >(pigz -9 >${OUTDIR}/text.bg.gz) \ + -hr >(pigz -9 >${OUTDIR}/text.hr.gz) \ + -sr >(pigz -9 >${OUTDIR}/text.sr.gz) \ + -ga >(pigz -9 >${OUTDIR}/text.ga.gz) \ + -gl >(pigz -9 >${OUTDIR}/text.gl.gz) \ + -tl >(pigz -9 >${OUTDIR}/text.tl.gz) \ + -tr >(pigz -9 >${OUTDIR}/text.tr.gz) \ + -uk >(pigz -9 >${OUTDIR}/text.uk.gz) \ + -hi >(pigz -9 >${OUTDIR}/text.hi.gz) \ + -mk >(pigz -9 >${OUTDIR}/text.mk.gz) \ + -bn >(pigz -9 >${OUTDIR}/text.bn.gz) \ + -id >(pigz -9 >${OUTDIR}/text.id.gz) \ + -la >(pigz -9 >${OUTDIR}/text.la.gz) \ + -ms >(pigz -9 >${OUTDIR}/text.ms.gz) \ + -ml >(pigz -9 >${OUTDIR}/text.ml.gz) \ + -cy >(pigz -9 >${OUTDIR}/text.cy.gz) \ + -ne >(pigz -9 
>${OUTDIR}/text.ne.gz) \ + -te >(pigz -9 >${OUTDIR}/text.te.gz) \ + -sq >(pigz -9 >${OUTDIR}/text.sq.gz) \ + -ta >(pigz -9 >${OUTDIR}/text.ta.gz) \ + -be >(pigz -9 >${OUTDIR}/text.be.gz) \ + -jw >(pigz -9 >${OUTDIR}/text.jw.gz) \ + -oc >(pigz -9 >${OUTDIR}/text.oc.gz) \ + -ur >(pigz -9 >${OUTDIR}/text.ur.gz) \ + -bh >(pigz -9 >${OUTDIR}/text.bh.gz) \ + -gu >(pigz -9 >${OUTDIR}/text.gu.gz) \ + -th >(pigz -9 >${OUTDIR}/text.th.gz) \ + -ar >(pigz -9 >${OUTDIR}/text.ar.gz) \ + -ca >(pigz -9 >${OUTDIR}/text.ca.gz) \ + -eo >(pigz -9 >${OUTDIR}/text.eo.gz) \ + -eu >(pigz -9 >${OUTDIR}/text.eu.gz) \ + -ia >(pigz -9 >${OUTDIR}/text.ia.gz) \ + -kn >(pigz -9 >${OUTDIR}/text.kn.gz) \ + -pa >(pigz -9 >${OUTDIR}/text.pa.gz) \ + -gd >(pigz -9 >${OUTDIR}/text.gd.gz) \ + -sw >(pigz -9 >${OUTDIR}/text.sw.gz) \ + -sl >(pigz -9 >${OUTDIR}/text.sl.gz) \ + -mr >(pigz -9 >${OUTDIR}/text.mr.gz) \ + -mt >(pigz -9 >${OUTDIR}/text.mt.gz) \ + -vi >(pigz -9 >${OUTDIR}/text.vi.gz) \ + -fy >(pigz -9 >${OUTDIR}/text.fy.gz) \ + -sk >(pigz -9 >${OUTDIR}/text.sk.gz) \ + -zh_Hant >(pigz -9 >${OUTDIR}/text.zh-Hant.gz) \ + -fo >(pigz -9 >${OUTDIR}/text.fo.gz) \ + -su >(pigz -9 >${OUTDIR}/text.su.gz) \ + -uz >(pigz -9 >${OUTDIR}/text.uz.gz) \ + -am >(pigz -9 >${OUTDIR}/text.am.gz) \ + -az >(pigz -9 >${OUTDIR}/text.az.gz) \ + -ka >(pigz -9 >${OUTDIR}/text.ka.gz) \ + -ti >(pigz -9 >${OUTDIR}/text.ti.gz) \ + -fa >(pigz -9 >${OUTDIR}/text.fa.gz) \ + -bs >(pigz -9 >${OUTDIR}/text.bs.gz) \ + -si >(pigz -9 >${OUTDIR}/text.si.gz) \ + -nn >(pigz -9 >${OUTDIR}/text.nn.gz) \ + -xh >(pigz -9 >${OUTDIR}/text.xh.gz) \ + -zu >(pigz -9 >${OUTDIR}/text.zu.gz) \ + -gn >(pigz -9 >${OUTDIR}/text.gn.gz) \ + -st >(pigz -9 >${OUTDIR}/text.st.gz) \ + -tk >(pigz -9 >${OUTDIR}/text.tk.gz) \ + -ky >(pigz -9 >${OUTDIR}/text.ky.gz) \ + -br >(pigz -9 >${OUTDIR}/text.br.gz) \ + -tw >(pigz -9 >${OUTDIR}/text.tw.gz) \ + -yi >(pigz -9 >${OUTDIR}/text.yi.gz) \ + -so >(pigz -9 >${OUTDIR}/text.so.gz) \ + -ug >(pigz -9 
>${OUTDIR}/text.ug.gz) \ + -ku >(pigz -9 >${OUTDIR}/text.ku.gz) \ + -mn >(pigz -9 >${OUTDIR}/text.mn.gz) \ + -hy >(pigz -9 >${OUTDIR}/text.hy.gz) \ + -lo >(pigz -9 >${OUTDIR}/text.lo.gz) \ + -sd >(pigz -9 >${OUTDIR}/text.sd.gz) \ + -rm >(pigz -9 >${OUTDIR}/text.rm.gz) \ + -af >(pigz -9 >${OUTDIR}/text.af.gz) \ + -lb >(pigz -9 >${OUTDIR}/text.lb.gz) \ + -my >(pigz -9 >${OUTDIR}/text.my.gz) \ + -km >(pigz -9 >${OUTDIR}/text.km.gz) \ + -bo >(pigz -9 >${OUTDIR}/text.bo.gz) \ + -dv >(pigz -9 >${OUTDIR}/text.dv.gz) \ + -chr >(pigz -9 >${OUTDIR}/text.chr.gz) \ + -syr >(pigz -9 >${OUTDIR}/text.syr.gz) \ + -lif >(pigz -9 >${OUTDIR}/text.lif.gz) \ + -or >(pigz -9 >${OUTDIR}/text.or.gz) \ + -as >(pigz -9 >${OUTDIR}/text.as.gz) \ + -co >(pigz -9 >${OUTDIR}/text.co.gz) \ + -ie >(pigz -9 >${OUTDIR}/text.ie.gz) \ + -kk >(pigz -9 >${OUTDIR}/text.kk.gz) \ + -ln >(pigz -9 >${OUTDIR}/text.ln.gz) \ + -mi >(pigz -9 >${OUTDIR}/text.mi.gz) \ + -wo >(pigz -9 >${OUTDIR}/text.wo.gz) \ + -ab >(pigz -9 >${OUTDIR}/text.ab.gz) \ + -aa >(pigz -9 >${OUTDIR}/text.aa.gz) \ + -ay >(pigz -9 >${OUTDIR}/text.ay.gz) \ + -ba >(pigz -9 >${OUTDIR}/text.ba.gz) \ + -bi >(pigz -9 >${OUTDIR}/text.bi.gz) \ + -dz >(pigz -9 >${OUTDIR}/text.dz.gz) \ + -fj >(pigz -9 >${OUTDIR}/text.fj.gz) \ + -kl >(pigz -9 >${OUTDIR}/text.kl.gz) \ + -ha >(pigz -9 >${OUTDIR}/text.ha.gz) \ + -ht >(pigz -9 >${OUTDIR}/text.ht.gz) \ + -ik >(pigz -9 >${OUTDIR}/text.ik.gz) \ + -iu >(pigz -9 >${OUTDIR}/text.iu.gz) \ + -ks >(pigz -9 >${OUTDIR}/text.ks.gz) \ + -rw >(pigz -9 >${OUTDIR}/text.rw.gz) \ + -mg >(pigz -9 >${OUTDIR}/text.mg.gz) \ + -na >(pigz -9 >${OUTDIR}/text.na.gz) \ + -om >(pigz -9 >${OUTDIR}/text.om.gz) \ + -rn >(pigz -9 >${OUTDIR}/text.rn.gz) \ + -sm >(pigz -9 >${OUTDIR}/text.sm.gz) \ + -sg >(pigz -9 >${OUTDIR}/text.sg.gz) \ + -sa >(pigz -9 >${OUTDIR}/text.sa.gz) \ + -ss >(pigz -9 >${OUTDIR}/text.ss.gz) \ + -ts >(pigz -9 >${OUTDIR}/text.ts.gz) \ + -tn >(pigz -9 >${OUTDIR}/text.tn.gz) \ + -vo >(pigz -9 >${OUTDIR}/text.vo.gz) \ 
+ -za >(pigz -9 >${OUTDIR}/text.za.gz) \ + -kha >(pigz -9 >${OUTDIR}/text.kha.gz) \ + -sco >(pigz -9 >${OUTDIR}/text.sco.gz) \ + -lg >(pigz -9 >${OUTDIR}/text.lg.gz) \ + -gv >(pigz -9 >${OUTDIR}/text.gv.gz) \ + -sr_ME >(pigz -9 >${OUTDIR}/text.sr-ME.gz) \ + -ak >(pigz -9 >${OUTDIR}/text.ak.gz) \ + -ig >(pigz -9 >${OUTDIR}/text.ig.gz) \ + -mfe >(pigz -9 >${OUTDIR}/text.mfe.gz) \ + -haw >(pigz -9 >${OUTDIR}/text.haw.gz) \ + -ceb >(pigz -9 >${OUTDIR}/text.ceb.gz) \ + -ee >(pigz -9 >${OUTDIR}/text.ee.gz) \ + -gaa >(pigz -9 >${OUTDIR}/text.gaa.gz) \ + -blu >(pigz -9 >${OUTDIR}/text.blu.gz) \ + -kri >(pigz -9 >${OUTDIR}/text.kri.gz) \ + -loz >(pigz -9 >${OUTDIR}/text.loz.gz) \ + -lua >(pigz -9 >${OUTDIR}/text.lua.gz) \ + -luo >(pigz -9 >${OUTDIR}/text.luo.gz) \ + -new >(pigz -9 >${OUTDIR}/text.new.gz) \ + -ny >(pigz -9 >${OUTDIR}/text.ny.gz) \ + -os >(pigz -9 >${OUTDIR}/text.os.gz) \ + -pam >(pigz -9 >${OUTDIR}/text.pam.gz) \ + -nso >(pigz -9 >${OUTDIR}/text.nso.gz) \ + -raj >(pigz -9 >${OUTDIR}/text.raj.gz) \ + -crs >(pigz -9 >${OUTDIR}/text.crs.gz) \ + -tum >(pigz -9 >${OUTDIR}/text.tum.gz) \ + -ve >(pigz -9 >${OUTDIR}/text.ve.gz) \ + -war >(pigz -9 >${OUTDIR}/text.war.gz) \ + -nr >(pigz -9 >${OUTDIR}/text.nr.gz) \ + -zzb >(pigz -9 >${OUTDIR}/text.zzb.gz) \ + -zzp >(pigz -9 >${OUTDIR}/text.zzp.gz) \ + -zzh >(pigz -9 >${OUTDIR}/text.zzh.gz) \ + -tlh >(pigz -9 >${OUTDIR}/text.tlh.gz) \ + -zze >(pigz -9 >${OUTDIR}/text.zze.gz) \ + -xx_Zyyy >(pigz -9 >${OUTDIR}/text.xx-Zyyy.gz) \ + -xx_Latn >(pigz -9 >${OUTDIR}/text.xx-Latn.gz) \ + -xx_Grek >(pigz -9 >${OUTDIR}/text.xx-Grek.gz) \ + -xx_Cyrl >(pigz -9 >${OUTDIR}/text.xx-Cyrl.gz) \ + -xx_Armn >(pigz -9 >${OUTDIR}/text.xx-Armn.gz) \ + -xx_Hebr >(pigz -9 >${OUTDIR}/text.xx-Hebr.gz) \ + -xx_Arab >(pigz -9 >${OUTDIR}/text.xx-Arab.gz) \ + -xx_Syrc >(pigz -9 >${OUTDIR}/text.xx-Syrc.gz) \ + -xx_Thaa >(pigz -9 >${OUTDIR}/text.xx-Thaa.gz) \ + -xx_Deva >(pigz -9 >${OUTDIR}/text.xx-Deva.gz) \ + -xx_Beng >(pigz -9 
>${OUTDIR}/text.xx-Beng.gz) \ + -xx_Guru >(pigz -9 >${OUTDIR}/text.xx-Guru.gz) \ + -xx_Gujr >(pigz -9 >${OUTDIR}/text.xx-Gujr.gz) \ + -xx_Orya >(pigz -9 >${OUTDIR}/text.xx-Orya.gz) \ + -xx_Taml >(pigz -9 >${OUTDIR}/text.xx-Taml.gz) \ + -xx_Telu >(pigz -9 >${OUTDIR}/text.xx-Telu.gz) \ + -xx_Knda >(pigz -9 >${OUTDIR}/text.xx-Knda.gz) \ + -xx_Mlym >(pigz -9 >${OUTDIR}/text.xx-Mlym.gz) \ + -xx_Sinh >(pigz -9 >${OUTDIR}/text.xx-Sinh.gz) \ + -xx_Thai >(pigz -9 >${OUTDIR}/text.xx-Thai.gz) \ + -xx_Laoo >(pigz -9 >${OUTDIR}/text.xx-Laoo.gz) \ + -xx_Tibt >(pigz -9 >${OUTDIR}/text.xx-Tibt.gz) \ + -xx_Mymr >(pigz -9 >${OUTDIR}/text.xx-Mymr.gz) \ + -xx_Geor >(pigz -9 >${OUTDIR}/text.xx-Geor.gz) \ + -xx_Hang >(pigz -9 >${OUTDIR}/text.xx-Hang.gz) \ + -xx_Ethi >(pigz -9 >${OUTDIR}/text.xx-Ethi.gz) \ + -xx_Cher >(pigz -9 >${OUTDIR}/text.xx-Cher.gz) \ + -xx_Cans >(pigz -9 >${OUTDIR}/text.xx-Cans.gz) \ + -xx_Ogam >(pigz -9 >${OUTDIR}/text.xx-Ogam.gz) \ + -xx_Runr >(pigz -9 >${OUTDIR}/text.xx-Runr.gz) \ + -xx_Khmr >(pigz -9 >${OUTDIR}/text.xx-Khmr.gz) \ + -xx_Mong >(pigz -9 >${OUTDIR}/text.xx-Mong.gz) \ + -xx_Hira >(pigz -9 >${OUTDIR}/text.xx-Hira.gz) \ + -xx_Kana >(pigz -9 >${OUTDIR}/text.xx-Kana.gz) \ + -xx_Bopo >(pigz -9 >${OUTDIR}/text.xx-Bopo.gz) \ + -xx_Hani >(pigz -9 >${OUTDIR}/text.xx-Hani.gz) \ + -xx_Yiii >(pigz -9 >${OUTDIR}/text.xx-Yiii.gz) \ + -xx_Ital >(pigz -9 >${OUTDIR}/text.xx-Ital.gz) \ + -xx_Goth >(pigz -9 >${OUTDIR}/text.xx-Goth.gz) \ + -xx_Dsrt >(pigz -9 >${OUTDIR}/text.xx-Dsrt.gz) \ + -xx_Qaai >(pigz -9 >${OUTDIR}/text.xx-Qaai.gz) \ + -xx_Tglg >(pigz -9 >${OUTDIR}/text.xx-Tglg.gz) \ + -xx_Hano >(pigz -9 >${OUTDIR}/text.xx-Hano.gz) \ + -xx_Buhd >(pigz -9 >${OUTDIR}/text.xx-Buhd.gz) \ + -xx_Tagb >(pigz -9 >${OUTDIR}/text.xx-Tagb.gz) \ + -xx_Limb >(pigz -9 >${OUTDIR}/text.xx-Limb.gz) \ + -xx_Tale >(pigz -9 >${OUTDIR}/text.xx-Tale.gz) \ + -xx_Linb >(pigz -9 >${OUTDIR}/text.xx-Linb.gz) \ + -xx_Ugar >(pigz -9 >${OUTDIR}/text.xx-Ugar.gz) \ + -xx_Shaw >(pigz -9 
>${OUTDIR}/text.xx-Shaw.gz) \ + -xx_Osma >(pigz -9 >${OUTDIR}/text.xx-Osma.gz) \ + -xx_Cprt >(pigz -9 >${OUTDIR}/text.xx-Cprt.gz) \ + -xx_Brai >(pigz -9 >${OUTDIR}/text.xx-Brai.gz) \ + -xx_Bugi >(pigz -9 >${OUTDIR}/text.xx-Bugi.gz) \ + -xx_Copt >(pigz -9 >${OUTDIR}/text.xx-Copt.gz) \ + -xx_Talu >(pigz -9 >${OUTDIR}/text.xx-Talu.gz) \ + -xx_Glag >(pigz -9 >${OUTDIR}/text.xx-Glag.gz) \ + -xx_Tfng >(pigz -9 >${OUTDIR}/text.xx-Tfng.gz) \ + -xx_Sylo >(pigz -9 >${OUTDIR}/text.xx-Sylo.gz) \ + -xx_Xpeo >(pigz -9 >${OUTDIR}/text.xx-Xpeo.gz) \ + -xx_Khar >(pigz -9 >${OUTDIR}/text.xx-Khar.gz) \ + -xx_Bali >(pigz -9 >${OUTDIR}/text.xx-Bali.gz) \ + -xx_Xsux >(pigz -9 >${OUTDIR}/text.xx-Xsux.gz) \ + -xx_Phnx >(pigz -9 >${OUTDIR}/text.xx-Phnx.gz) \ + -xx_Phag >(pigz -9 >${OUTDIR}/text.xx-Phag.gz) \ + -xx_Nkoo >(pigz -9 >${OUTDIR}/text.xx-Nkoo.gz) \ + -xx_Sund >(pigz -9 >${OUTDIR}/text.xx-Sund.gz) \ + -xx_Lepc >(pigz -9 >${OUTDIR}/text.xx-Lepc.gz) \ + -xx_Olck >(pigz -9 >${OUTDIR}/text.xx-Olck.gz) \ + -xx_Vaii >(pigz -9 >${OUTDIR}/text.xx-Vaii.gz) \ + -xx_Saur >(pigz -9 >${OUTDIR}/text.xx-Saur.gz) \ + -xx_Kali >(pigz -9 >${OUTDIR}/text.xx-Kali.gz) \ + -xx_Rjng >(pigz -9 >${OUTDIR}/text.xx-Rjng.gz) \ + -xx_Lyci >(pigz -9 >${OUTDIR}/text.xx-Lyci.gz) \ + -xx_Cari >(pigz -9 >${OUTDIR}/text.xx-Cari.gz) \ + -xx_Lydi >(pigz -9 >${OUTDIR}/text.xx-Lydi.gz) \ + -xx_Cham >(pigz -9 >${OUTDIR}/text.xx-Cham.gz) \ + -xx_Lana >(pigz -9 >${OUTDIR}/text.xx-Lana.gz) \ + -xx_Tavt >(pigz -9 >${OUTDIR}/text.xx-Tavt.gz) \ + -xx_Avst >(pigz -9 >${OUTDIR}/text.xx-Avst.gz) \ + -xx_Egyp >(pigz -9 >${OUTDIR}/text.xx-Egyp.gz) \ + -xx_Samr >(pigz -9 >${OUTDIR}/text.xx-Samr.gz) \ + -xx_Lisu >(pigz -9 >${OUTDIR}/text.xx-Lisu.gz) \ + -xx_Bamu >(pigz -9 >${OUTDIR}/text.xx-Bamu.gz) \ + -xx_Java >(pigz -9 >${OUTDIR}/text.xx-Java.gz) \ + -xx_Mtei >(pigz -9 >${OUTDIR}/text.xx-Mtei.gz) \ + -xx_Armi >(pigz -9 >${OUTDIR}/text.xx-Armi.gz) \ + -xx_Sarb >(pigz -9 >${OUTDIR}/text.xx-Sarb.gz) \ + -xx_Prti >(pigz -9 
>${OUTDIR}/text.xx-Prti.gz) \ + -xx_Phli >(pigz -9 >${OUTDIR}/text.xx-Phli.gz) \ + -xx_Orkh >(pigz -9 >${OUTDIR}/text.xx-Orkh.gz) \ + -xx_Kthi >(pigz -9 >${OUTDIR}/text.xx-Kthi.gz) \ + -xx_Batk >(pigz -9 >${OUTDIR}/text.xx-Batk.gz) \ + -xx_Brah >(pigz -9 >${OUTDIR}/text.xx-Brah.gz) \ + -xx_Mand >(pigz -9 >${OUTDIR}/text.xx-Mand.gz) \ + -xx_Cakm >(pigz -9 >${OUTDIR}/text.xx-Cakm.gz) \ + -xx_Merc >(pigz -9 >${OUTDIR}/text.xx-Merc.gz) \ + -xx_Mero >(pigz -9 >${OUTDIR}/text.xx-Mero.gz) \ + -xx_Plrd >(pigz -9 >${OUTDIR}/text.xx-Plrd.gz) \ + -xx_Shrd >(pigz -9 >${OUTDIR}/text.xx-Shrd.gz) \ + -xx_Sora >(pigz -9 >${OUTDIR}/text.xx-Sora.gz) \ + -xx_Takr >(pigz -9 >${OUTDIR}/text.xx-Takr.gz) + touch ${DONEFILE} +fi diff --git a/raw/create_raw.sh b/raw/create_raw.sh new file mode 100755 index 0000000..4ce8acb --- /dev/null +++ b/raw/create_raw.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +set -e +set -o pipefail + +# Parallel command: +# cat langugage.codes | parallel --nice 19 --progress --sshloginfile file \ +# create_raw.sh $crawl_dir $out_dir {} + +INDIR=$1 +OUTDIR=$2 +LANGCODE=$3 + +OUTFILE=${OUTDIR}/${LANGCODE}.raw.xz +DONEFILE=${OUTFILE}.done + +unsafe_gunzip() { + # unsafe_gunzip makes it possible to open several .gz files which are corrupted. + # In our case many .gz files fail with a an "Unexpected end of file" error. 
+ set +e + set +o pipefail + for file in "$@"; do + gzip -cd "$file" 2> /dev/null + echo + done + set -o pipefail + set -e +} + +if [[ -f ${DONEFILE} ]]; then + exit 0 +fi + +unsafe_gunzip ${INDIR}/*/text.${LANGCODE}.gz | xz -c > "${OUTFILE}" + +touch "${DONEFILE}" diff --git a/raw/create_raw_en.sh b/raw/create_raw_en.sh new file mode 100755 index 0000000..3fa856d --- /dev/null +++ b/raw/create_raw_en.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -e +set -o pipefail + +SOURCEFILE=$1 +DESTINATION=$2 +INDEX=$3 +PADDEDINDEX=$(printf %02d $((INDEX - 1))) +NEWFILE=${DESTINATION}/en.${PADDEDINDEX}.raw.xz +DONEFILE=${NEWFILE}.done +if [[ -f ${DONEFILE} ]]; then + exit 0 +fi + +unsafe_gunzip() { + # unsafe_gunzip makes it possible to open several .gz files which are corrupted. + # In our case many .gz files fail with an "Unexpected end of file" error. + set +e + set +o pipefail + for file in "$@"; do + gzip -cd "$file" 2> /dev/null + echo + done + set -o pipefail + set -e +} + +unsafe_gunzip "${SOURCEFILE}" | xz -c > "${NEWFILE}" + +touch "${DONEFILE}" diff --git a/s3/FILES.md b/s3/FILES.md new file mode 100644 index 0000000..1a2ff0d --- /dev/null +++ b/s3/FILES.md @@ -0,0 +1,14 @@ +# S3 File Structure + +In general we have a separate directory for each language, which in turn contains up to three subdirectories: +``` +s3://web-language-models/ngrams/${lang}/deduped +s3://web-language-models/ngrams/${lang}/raw +s3://web-language-models/ngrams/${lang}/lm +``` + +The deduped folder contains the deduped file of that language with the corresponding offset file. The raw folder contains the `.raw` files for each individual crawl. The lm folder contains the language model, if one exists. + +## Irregularities + +The English language model is located at `s3://web-language-models/ngrams/lm/en.trie.xz` which is an artifact of the old file structure. All attempts to copy the language model on AWS failed due to the size of the model. 
We might need to reupload it if we want to change its location. diff --git a/s3/check_deduped_en.sh b/s3/check_deduped_en.sh new file mode 100755 index 0000000..56b5522 --- /dev/null +++ b/s3/check_deduped_en.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e +set -o pipefail + +for i in $(seq -f "%02g" 0 99); do + SOURCEFILE="s3://web-language-models/ngrams/deduped/en/en.${i}.deduped.xz" + TARGETFILE="s3://web-language-models/ngrams/en/deduped/en.${i}.deduped.xz" + SOURCESIZE=$(s3cmd ls ${SOURCEFILE} | cut -d ' ' -f 3) + TARGETSIZE=$(s3cmd ls ${TARGETFILE} | cut -d ' ' -f 3) + if [[ ! ${SOURCESIZE} -eq ${TARGETSIZE} ]]; then + echo "Mismatch on file ${SOURCEFILE}" + fi +done diff --git a/s3/check_raw_uploads.sh b/s3/check_raw_uploads.sh new file mode 100755 index 0000000..b32d576 --- /dev/null +++ b/s3/check_raw_uploads.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e +set -o pipefail + +FILE=$(echo $1 | awk ' BEGIN { FS = "/" } { print $(NF) }') + +BUCKET="s3://web-language-models/ngrams/en/raw/${FILE}" + +MD5SUM1=$(echo $(/home/tim/bin/s3cmd/s3cmd ls --list-md5 ${BUCKET}) | awk 'BEGIN { FS = " " } {print $(NF-1)}') +MD5SUM2=$(md5sum $1 | awk 'BEGIN { FS = " " } {print $1}') +if [ ${MD5SUM1} != ${MD5SUM2} ]; then + echo "$1" +fi diff --git a/s3/check_raw_uploads_en.sh b/s3/check_raw_uploads_en.sh new file mode 100644 index 0000000..ef7029a --- /dev/null +++ b/s3/check_raw_uploads_en.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +FILE=$(echo "$1" | awk ' BEGIN { FS = "/" } { print $(NF) }') +OUTFILE="$2" + +MD5SUM=$(md5sum "$1" | awk 'BEGIN { FS = " " } {print $1}') +echo "${MD5SUM} ${FILE}" >> "${OUTFILE}" diff --git a/s3/continue_multipart.sh b/s3/continue_multipart.sh new file mode 100755 index 0000000..50448c8 --- /dev/null +++ b/s3/continue_multipart.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e +set -o pipefail + +s3cmd multipart s3://web-language-models/ngrams/raw_en/ | \ + grep -o "en.[^/]*.xz" | \ + parallel --nice 19 --progress -j 8 s3cmd put --continue-put {} 
s3://web-language-models/ngrams/en/raw/{} diff --git a/s3/copy_deduped.py b/s3/copy_deduped.py new file mode 100755 index 0000000..14f9158 --- /dev/null +++ b/s3/copy_deduped.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +import sys +import boto3 +from boto3.s3.transfer import TransferConfig + +for line in sys.stdin: + link = line.split()[-1] + if ".xz" in link: + lang = link.split('/')[-1].split('.')[0] + sourcekey = "ngrams/deduped/{lang}.deduped.xz".format(lang=lang) + targetkey = "ngrams/{lang}/deduped/{lang}.deduped.xz".format(lang=lang) + + print("Copy from {} to {}..".format(sourcekey, targetkey)) + chunksize = 1000 * 1000000 + transferConfig = TransferConfig(multipart_threshold=chunksize, multipart_chunksize=chunksize) + + s3 = boto3.resource('s3') + copy_source = { + 'Bucket': 'web-language-models', + 'Key': sourcekey + } + s3.meta.client.copy(copy_source, 'web-language-models', targetkey, Config=transferConfig) + diff --git a/s3/copy_deduped_en.py b/s3/copy_deduped_en.py new file mode 100755 index 0000000..e7d7ea6 --- /dev/null +++ b/s3/copy_deduped_en.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +import sys +import boto3 +from boto3.s3.transfer import TransferConfig + +for line in sys.stdin: + link = line.split()[-1] + if ".xz" in link: + index = link.split('/')[-1].split('.')[1] + sourcekey = "ngrams/deduped/en/en.{index}.deduped.xz".format(index=index) + targetkey = "ngrams/en/deduped/en.{index}.deduped.xz".format(index=index) + + print("Copy from {} to {}..".format(sourcekey, targetkey)) + chunksize = 1000 * 1000000 + transferConfig = TransferConfig(multipart_threshold=chunksize, multipart_chunksize=chunksize) + + s3 = boto3.resource('s3') + copy_source = { + 'Bucket': 'web-language-models', + 'Key': sourcekey + } + s3.meta.client.copy(copy_source, 'web-language-models', targetkey, Config=transferConfig) + diff --git a/s3/count_uploads.sh b/s3/count_uploads.sh new file mode 100755 index 0000000..02b4a28 --- /dev/null +++ b/s3/count_uploads.sh @@ -0,0 
+1,23 @@ +#!/bin/bash + +TOTAL=0 +UPLOADED=0 +echo -n "${TOTAL}" +for filepath in /fs/vali0/www/data.statmt.org/ngrams/raw/*.xz; do + TOTAL=$((TOTAL+1)) + echo -en "\e[1A"; echo -e "\e[0K\r ${TOTAL}" + FILE=$(echo $filepath | awk ' BEGIN { FS = "/" } { print $(NF) }') + LANGUAGE=$(echo ${FILE} | awk ' BEGIN { FS = "." } { print $1 }') + YEAR=$(echo ${FILE} | awk ' BEGIN {FS = "." } { print $2 }') + VERSION="00" + + NEW_FILENAME=$(echo ${FILE} | sed "s/[0-9_]\{1,\}/${YEAR}.${VERSION}/") + BUCKET="s3://web-language-models/ngrams/${LANGUAGE}/raw/${NEW_FILENAME}" + + # Increase counter if file already exists. + if [[ ! -z $(/home/tim/bin/s3cmd/s3cmd ls ${BUCKET}) ]]; then + UPLOADED=$((UPLOADED+1)) + fi +done + +echo -en "\e[1A"; echo -e "\e[0K\r ${UPLOADED}/${TOTAL}" diff --git a/s3/create_md5_sums.sh b/s3/create_md5_sums.sh new file mode 100755 index 0000000..8fb0f20 --- /dev/null +++ b/s3/create_md5_sums.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +FILE=$(echo "$1" | awk ' BEGIN { FS = "/" } { print $(NF) }') +OUTFILE="$2" + +MD5SUM=$(md5sum $1 | awk 'BEGIN { FS = " " } {print $1}') +echo "${MD5SUM} ${FILE}" >> "${OUTFILE}" diff --git a/s3/index_files/index_lm.html b/s3/index_files/index_lm.html new file mode 100644 index 0000000..4fddcb1 --- /dev/null +++ b/s3/index_files/index_lm.html @@ -0,0 +1,18 @@ + + + + Language Models + + + +

Language Models

+ + + diff --git a/s3/index_files/index_raw.html b/s3/index_files/index_raw.html new file mode 100644 index 0000000..6464843 --- /dev/null +++ b/s3/index_files/index_raw.html @@ -0,0 +1,926 @@ + + + + Raw CommonCrawl + + + +

Raw CommonCrawl

+ + + diff --git a/s3/index_files/raw_links b/s3/index_files/raw_links new file mode 100644 index 0000000..2aaae22 --- /dev/null +++ b/s3/index_files/raw_links @@ -0,0 +1,914 @@ +
  • aa.2012.00.raw.xz
  • +
  • aa.2013_1.00.raw.xz
  • +
  • aa.2013_2.00.raw.xz
  • +
  • aa.2014_1.00.raw.xz
  • +
  • ab.2012.00.raw.xz
  • +
  • ab.2013_1.00.raw.xz
  • +
  • ab.2013_2.00.raw.xz
  • +
  • ab.2014_1.00.raw.xz
  • +
  • af.2012.00.raw.xz
  • +
  • af.2013_1.00.raw.xz
  • +
  • af.2013_2.00.raw.xz
  • +
  • af.2014_1.00.raw.xz
  • +
  • ak.2012.00.raw.xz
  • +
  • ak.2013_1.00.raw.xz
  • +
  • ak.2013_2.00.raw.xz
  • +
  • ak.2014_1.00.raw.xz
  • +
  • am.2012.00.raw.xz
  • +
  • am.2013_1.00.raw.xz
  • +
  • am.2013_2.00.raw.xz
  • +
  • am.2014_1.00.raw.xz
  • +
  • ar.2012.00.raw.xz
  • +
  • ar.2013_1.00.raw.xz
  • +
  • ar.2013_2.00.raw.xz
  • +
  • ar.2014_1.00.raw.xz
  • +
  • as.2012.00.raw.xz
  • +
  • as.2013_1.00.raw.xz
  • +
  • as.2013_2.00.raw.xz
  • +
  • as.2014_1.00.raw.xz
  • +
  • ay.2012.00.raw.xz
  • +
  • ay.2013_1.00.raw.xz
  • +
  • ay.2013_2.00.raw.xz
  • +
  • ay.2014_1.00.raw.xz
  • +
  • az.2012.00.raw.xz
  • +
  • az.2013_1.00.raw.xz
  • +
  • az.2013_2.00.raw.xz
  • +
  • az.2014_1.00.raw.xz
  • +
  • ba.2012.00.raw.xz
  • +
  • ba.2013_1.00.raw.xz
  • +
  • ba.2013_2.00.raw.xz
  • +
  • ba.2014_1.00.raw.xz
  • +
  • be.2012.00.raw.xz
  • +
  • be.2013_1.00.raw.xz
  • +
  • be.2013_2.00.raw.xz
  • +
  • be.2014_1.00.raw.xz
  • +
  • bg.2012.00.raw.xz
  • +
  • bg.2013_1.00.raw.xz
  • +
  • bg.2013_2.00.raw.xz
  • +
  • bg.2014_1.00.raw.xz
  • +
  • bh.2012.00.raw.xz
  • +
  • bh.2013_1.00.raw.xz
  • +
  • bh.2013_2.00.raw.xz
  • +
  • bh.2014_1.00.raw.xz
  • +
  • bi.2012.00.raw.xz
  • +
  • bi.2013_1.00.raw.xz
  • +
  • bi.2013_2.00.raw.xz
  • +
  • bi.2014_1.00.raw.xz
  • +
  • blu.2012.00.raw.xz
  • +
  • blu.2013_1.00.raw.xz
  • +
  • blu.2013_2.00.raw.xz
  • +
  • blu.2014_1.00.raw.xz
  • +
  • bn.2012.00.raw.xz
  • +
  • bn.2013_1.00.raw.xz
  • +
  • bn.2013_2.00.raw.xz
  • +
  • bn.2014_1.00.raw.xz
  • +
  • bo.2012.00.raw.xz
  • +
  • bo.2013_1.00.raw.xz
  • +
  • bo.2013_2.00.raw.xz
  • +
  • bo.2014_1.00.raw.xz
  • +
  • br.2012.00.raw.xz
  • +
  • br.2013_1.00.raw.xz
  • +
  • br.2013_2.00.raw.xz
  • +
  • br.2014_1.00.raw.xz
  • +
  • bs.2012.00.raw.xz
  • +
  • bs.2013_1.00.raw.xz
  • +
  • bs.2013_2.00.raw.xz
  • +
  • bs.2014_1.00.raw.xz
  • +
  • ca.2012.00.raw.xz
  • +
  • ca.2013_1.00.raw.xz
  • +
  • ca.2013_2.00.raw.xz
  • +
  • ca.2014_1.00.raw.xz
  • +
  • ceb.2012.00.raw.xz
  • +
  • ceb.2013_1.00.raw.xz
  • +
  • ceb.2013_2.00.raw.xz
  • +
  • ceb.2014_1.00.raw.xz
  • +
  • chr.2012.00.raw.xz
  • +
  • chr.2013_1.00.raw.xz
  • +
  • chr.2013_2.00.raw.xz
  • +
  • chr.2014_1.00.raw.xz
  • +
  • co.2012.00.raw.xz
  • +
  • co.2013_1.00.raw.xz
  • +
  • co.2013_2.00.raw.xz
  • +
  • co.2014_1.00.raw.xz
  • +
  • crs.2012.00.raw.xz
  • +
  • crs.2013_1.00.raw.xz
  • +
  • crs.2013_2.00.raw.xz
  • +
  • crs.2014_1.00.raw.xz
  • +
  • cs.2012.00.raw.xz
  • +
  • cs.2013_1.00.raw.xz
  • +
  • cs.2013_2.00.raw.xz
  • +
  • cs.2014_1.00.raw.xz
  • +
  • cy.2012.00.raw.xz
  • +
  • cy.2013_1.00.raw.xz
  • +
  • cy.2013_2.00.raw.xz
  • +
  • cy.2014_1.00.raw.xz
  • +
  • da.2012.00.raw.xz
  • +
  • da.2013_1.00.raw.xz
  • +
  • da.2013_2.00.raw.xz
  • +
  • da.2014_1.00.raw.xz
  • +
  • de.2012.00.raw.xz
  • +
  • de.2013_1.00.raw.xz
  • +
  • de.2013_2.00.raw.xz
  • +
  • de.2014_1.00.raw.xz
  • +
  • dv.2012.00.raw.xz
  • +
  • dv.2013_1.00.raw.xz
  • +
  • dv.2013_2.00.raw.xz
  • +
  • dv.2014_1.00.raw.xz
  • +
  • dz.2012.00.raw.xz
  • +
  • dz.2013_1.00.raw.xz
  • +
  • dz.2013_2.00.raw.xz
  • +
  • dz.2014_1.00.raw.xz
  • +
  • el.2012.00.raw.xz
  • +
  • el.2013_1.00.raw.xz
  • +
  • el.2013_2.00.raw.xz
  • +
  • el.2014_1.00.raw.xz
  • +
  • eo.2012.00.raw.xz
  • +
  • eo.2013_1.00.raw.xz
  • +
  • eo.2013_2.00.raw.xz
  • +
  • eo.2014_1.00.raw.xz
  • +
  • es.2012.00.raw.xz
  • +
  • es.2013_1.00.raw.xz
  • +
  • es.2013_2.00.raw.xz
  • +
  • es.2014_1.00.raw.xz
  • +
  • et.2012.00.raw.xz
  • +
  • et.2013_1.00.raw.xz
  • +
  • et.2013_2.00.raw.xz
  • +
  • et.2014_1.00.raw.xz
  • +
  • eu.2012.00.raw.xz
  • +
  • eu.2013_1.00.raw.xz
  • +
  • eu.2013_2.00.raw.xz
  • +
  • eu.2014_1.00.raw.xz
  • +
  • fa.2012.00.raw.xz
  • +
  • fa.2013_1.00.raw.xz
  • +
  • fa.2013_2.00.raw.xz
  • +
  • fa.2014_1.00.raw.xz
  • +
  • fi.2012.00.raw.xz
  • +
  • fi.2013_1.00.raw.xz
  • +
  • fi.2013_2.00.raw.xz
  • +
  • fi.2014_1.00.raw.xz
  • +
  • fj.2012.00.raw.xz
  • +
  • fj.2013_1.00.raw.xz
  • +
  • fj.2013_2.00.raw.xz
  • +
  • fj.2014_1.00.raw.xz
  • +
  • fo.2012.00.raw.xz
  • +
  • fo.2013_1.00.raw.xz
  • +
  • fo.2013_2.00.raw.xz
  • +
  • fo.2014_1.00.raw.xz
  • +
  • fr.2012.00.raw.xz
  • +
  • fr.2013_1.00.raw.xz
  • +
  • fr.2013_2.00.raw.xz
  • +
  • fr.2014_1.00.raw.xz
  • +
  • fy.2012.00.raw.xz
  • +
  • fy.2013_1.00.raw.xz
  • +
  • fy.2013_2.00.raw.xz
  • +
  • fy.2014_1.00.raw.xz
  • +
  • ga.2012.00.raw.xz
  • +
  • ga.2013_1.00.raw.xz
  • +
  • ga.2013_2.00.raw.xz
  • +
  • ga.2014_1.00.raw.xz
  • +
  • gd.2012.00.raw.xz
  • +
  • gd.2013_1.00.raw.xz
  • +
  • gd.2013_2.00.raw.xz
  • +
  • gd.2014_1.00.raw.xz
  • +
  • gl.2012.00.raw.xz
  • +
  • gl.2013_1.00.raw.xz
  • +
  • gl.2013_2.00.raw.xz
  • +
  • gl.2014_1.00.raw.xz
  • +
  • gn.2012.00.raw.xz
  • +
  • gn.2013_1.00.raw.xz
  • +
  • gn.2013_2.00.raw.xz
  • +
  • gn.2014_1.00.raw.xz
  • +
  • gu.2012.00.raw.xz
  • +
  • gu.2013_1.00.raw.xz
  • +
  • gu.2013_2.00.raw.xz
  • +
  • gu.2014_1.00.raw.xz
  • +
  • gv.2012.00.raw.xz
  • +
  • gv.2013_1.00.raw.xz
  • +
  • gv.2013_2.00.raw.xz
  • +
  • gv.2014_1.00.raw.xz
  • +
  • ha.2012.00.raw.xz
  • +
  • ha.2013_1.00.raw.xz
  • +
  • ha.2013_2.00.raw.xz
  • +
  • ha.2014_1.00.raw.xz
  • +
  • haw.2012.00.raw.xz
  • +
  • haw.2013_1.00.raw.xz
  • +
  • haw.2013_2.00.raw.xz
  • +
  • haw.2014_1.00.raw.xz
  • +
  • hi.2012.00.raw.xz
  • +
  • hi.2013_1.00.raw.xz
  • +
  • hi.2013_2.00.raw.xz
  • +
  • hi.2014_1.00.raw.xz
  • +
  • hr.2012.00.raw.xz
  • +
  • hr.2013_1.00.raw.xz
  • +
  • hr.2013_2.00.raw.xz
  • +
  • hr.2014_1.00.raw.xz
  • +
  • ht.2012.00.raw.xz
  • +
  • ht.2013_1.00.raw.xz
  • +
  • ht.2013_2.00.raw.xz
  • +
  • ht.2014_1.00.raw.xz
  • +
  • hu.2012.00.raw.xz
  • +
  • hu.2013_1.00.raw.xz
  • +
  • hu.2013_2.00.raw.xz
  • +
  • hu.2014_1.00.raw.xz
  • +
  • hy.2012.00.raw.xz
  • +
  • hy.2013_1.00.raw.xz
  • +
  • hy.2013_2.00.raw.xz
  • +
  • hy.2014_1.00.raw.xz
  • +
  • ia.2012.00.raw.xz
  • +
  • ia.2013_1.00.raw.xz
  • +
  • ia.2013_2.00.raw.xz
  • +
  • ia.2014_1.00.raw.xz
  • +
  • id.2012.00.raw.xz
  • +
  • id.2013_1.00.raw.xz
  • +
  • id.2013_2.00.raw.xz
  • +
  • id.2014_1.00.raw.xz
  • +
  • ie.2012.00.raw.xz
  • +
  • ie.2013_1.00.raw.xz
  • +
  • ie.2013_2.00.raw.xz
  • +
  • ie.2014_1.00.raw.xz
  • +
  • ig.2012.00.raw.xz
  • +
  • ig.2013_1.00.raw.xz
  • +
  • ig.2013_2.00.raw.xz
  • +
  • ig.2014_1.00.raw.xz
  • +
  • ik.2012.00.raw.xz
  • +
  • ik.2013_1.00.raw.xz
  • +
  • ik.2013_2.00.raw.xz
  • +
  • ik.2014_1.00.raw.xz
  • +
  • is.2012.00.raw.xz
  • +
  • is.2013_1.00.raw.xz
  • +
  • is.2013_2.00.raw.xz
  • +
  • is.2014_1.00.raw.xz
  • +
  • it.2012.00.raw.xz
  • +
  • it.2013_1.00.raw.xz
  • +
  • it.2013_2.00.raw.xz
  • +
  • it.2014_1.00.raw.xz
  • +
  • iu.2012.00.raw.xz
  • +
  • iu.2013_1.00.raw.xz
  • +
  • iu.2013_2.00.raw.xz
  • +
  • iu.2014_1.00.raw.xz
  • +
  • iw.2012.00.raw.xz
  • +
  • iw.2013_1.00.raw.xz
  • +
  • iw.2013_2.00.raw.xz
  • +
  • iw.2014_1.00.raw.xz
  • +
  • ja.2012.00.raw.xz
  • +
  • ja.2013_1.00.raw.xz
  • +
  • ja.2013_2.00.raw.xz
  • +
  • ja.2014_1.00.raw.xz
  • +
  • jw.2012.00.raw.xz
  • +
  • jw.2013_1.00.raw.xz
  • +
  • jw.2013_2.00.raw.xz
  • +
  • jw.2014_1.00.raw.xz
  • +
  • ka.2012.00.raw.xz
  • +
  • ka.2013_1.00.raw.xz
  • +
  • ka.2013_2.00.raw.xz
  • +
  • ka.2014_1.00.raw.xz
  • +
  • kha.2012.00.raw.xz
  • +
  • kha.2013_1.00.raw.xz
  • +
  • kha.2013_2.00.raw.xz
  • +
  • kha.2014_1.00.raw.xz
  • +
  • kk.2012.00.raw.xz
  • +
  • kk.2013_1.00.raw.xz
  • +
  • kk.2013_2.00.raw.xz
  • +
  • kk.2014_1.00.raw.xz
  • +
  • kl.2012.00.raw.xz
  • +
  • kl.2013_1.00.raw.xz
  • +
  • kl.2013_2.00.raw.xz
  • +
  • kl.2014_1.00.raw.xz
  • +
  • km.2012.00.raw.xz
  • +
  • km.2013_1.00.raw.xz
  • +
  • km.2013_2.00.raw.xz
  • +
  • km.2014_1.00.raw.xz
  • +
  • kn.2012.00.raw.xz
  • +
  • kn.2013_1.00.raw.xz
  • +
  • kn.2013_2.00.raw.xz
  • +
  • kn.2014_1.00.raw.xz
  • +
  • ko.2012.00.raw.xz
  • +
  • ko.2013_1.00.raw.xz
  • +
  • ko.2013_2.00.raw.xz
  • +
  • ko.2014_1.00.raw.xz
  • +
  • ks.2012.00.raw.xz
  • +
  • ks.2013_1.00.raw.xz
  • +
  • ks.2013_2.00.raw.xz
  • +
  • ks.2014_1.00.raw.xz
  • +
  • ku.2012.00.raw.xz
  • +
  • ku.2013_1.00.raw.xz
  • +
  • ku.2013_2.00.raw.xz
  • +
  • ku.2014_1.00.raw.xz
  • +
  • ky.2012.00.raw.xz
  • +
  • ky.2013_1.00.raw.xz
  • +
  • ky.2013_2.00.raw.xz
  • +
  • ky.2014_1.00.raw.xz
  • +
  • la.2012.00.raw.xz
  • +
  • la.2013_1.00.raw.xz
  • +
  • la.2013_2.00.raw.xz
  • +
  • la.2014_1.00.raw.xz
  • +
  • lb.2012.00.raw.xz
  • +
  • lb.2013_1.00.raw.xz
  • +
  • lb.2013_2.00.raw.xz
  • +
  • lb.2014_1.00.raw.xz
  • +
  • lg.2012.00.raw.xz
  • +
  • lg.2013_1.00.raw.xz
  • +
  • lg.2013_2.00.raw.xz
  • +
  • lg.2014_1.00.raw.xz
  • +
  • lif.2012.00.raw.xz
  • +
  • lif.2013_1.00.raw.xz
  • +
  • lif.2013_2.00.raw.xz
  • +
  • lif.2014_1.00.raw.xz
  • +
  • ln.2012.00.raw.xz
  • +
  • ln.2013_1.00.raw.xz
  • +
  • ln.2013_2.00.raw.xz
  • +
  • ln.2014_1.00.raw.xz
  • +
  • lo.2012.00.raw.xz
  • +
  • lo.2013_1.00.raw.xz
  • +
  • lo.2013_2.00.raw.xz
  • +
  • lo.2014_1.00.raw.xz
  • +
  • lt.2012.00.raw.xz
  • +
  • lt.2013_1.00.raw.xz
  • +
  • lt.2013_2.00.raw.xz
  • +
  • lt.2014_1.00.raw.xz
  • +
  • lv.2012.00.raw.xz
  • +
  • lv.2013_1.00.raw.xz
  • +
  • lv.2013_2.00.raw.xz
  • +
  • lv.2014_1.00.raw.xz
  • +
  • mfe.2012.00.raw.xz
  • +
  • mfe.2013_1.00.raw.xz
  • +
  • mfe.2013_2.00.raw.xz
  • +
  • mfe.2014_1.00.raw.xz
  • +
  • mg.2012.00.raw.xz
  • +
  • mg.2013_1.00.raw.xz
  • +
  • mg.2013_2.00.raw.xz
  • +
  • mg.2014_1.00.raw.xz
  • +
  • mi.2012.00.raw.xz
  • +
  • mi.2013_1.00.raw.xz
  • +
  • mi.2013_2.00.raw.xz
  • +
  • mi.2014_1.00.raw.xz
  • +
  • mk.2012.00.raw.xz
  • +
  • mk.2013_1.00.raw.xz
  • +
  • mk.2013_2.00.raw.xz
  • +
  • mk.2014_1.00.raw.xz
  • +
  • ml.2012.00.raw.xz
  • +
  • ml.2013_1.00.raw.xz
  • +
  • ml.2013_2.00.raw.xz
  • +
  • ml.2014_1.00.raw.xz
  • +
  • mn.2012.00.raw.xz
  • +
  • mn.2013_1.00.raw.xz
  • +
  • mn.2013_2.00.raw.xz
  • +
  • mn.2014_1.00.raw.xz
  • +
  • mr.2012.00.raw.xz
  • +
  • mr.2013_1.00.raw.xz
  • +
  • mr.2013_2.00.raw.xz
  • +
  • mr.2014_1.00.raw.xz
  • +
  • ms.2012.00.raw.xz
  • +
  • ms.2013_1.00.raw.xz
  • +
  • ms.2013_2.00.raw.xz
  • +
  • ms.2014_1.00.raw.xz
  • +
  • mt.2012.00.raw.xz
  • +
  • mt.2013_1.00.raw.xz
  • +
  • mt.2013_2.00.raw.xz
  • +
  • mt.2014_1.00.raw.xz
  • +
  • my.2012.00.raw.xz
  • +
  • my.2013_1.00.raw.xz
  • +
  • my.2013_2.00.raw.xz
  • +
  • my.2014_1.00.raw.xz
  • +
  • na.2012.00.raw.xz
  • +
  • na.2013_1.00.raw.xz
  • +
  • na.2013_2.00.raw.xz
  • +
  • na.2014_1.00.raw.xz
  • +
  • ne.2012.00.raw.xz
  • +
  • ne.2013_1.00.raw.xz
  • +
  • ne.2013_2.00.raw.xz
  • +
  • ne.2014_1.00.raw.xz
  • +
  • nl.2012.00.raw.xz
  • +
  • nl.2013_1.00.raw.xz
  • +
  • nl.2013_2.00.raw.xz
  • +
  • nl.2014_1.00.raw.xz
  • +
  • nn.2012.00.raw.xz
  • +
  • nn.2013_1.00.raw.xz
  • +
  • nn.2013_2.00.raw.xz
  • +
  • nn.2014_1.00.raw.xz
  • +
  • no.2012.00.raw.xz
  • +
  • no.2013_1.00.raw.xz
  • +
  • no.2013_2.00.raw.xz
  • +
  • no.2014_1.00.raw.xz
  • +
  • nso.2012.00.raw.xz
  • +
  • nso.2013_1.00.raw.xz
  • +
  • nso.2013_2.00.raw.xz
  • +
  • nso.2014_1.00.raw.xz
  • +
  • ny.2012.00.raw.xz
  • +
  • ny.2013_1.00.raw.xz
  • +
  • ny.2013_2.00.raw.xz
  • +
  • ny.2014_1.00.raw.xz
  • +
  • oc.2012.00.raw.xz
  • +
  • oc.2013_1.00.raw.xz
  • +
  • oc.2013_2.00.raw.xz
  • +
  • oc.2014_1.00.raw.xz
  • +
  • om.2012.00.raw.xz
  • +
  • om.2013_1.00.raw.xz
  • +
  • om.2013_2.00.raw.xz
  • +
  • om.2014_1.00.raw.xz
  • +
  • or.2012.00.raw.xz
  • +
  • or.2013_1.00.raw.xz
  • +
  • or.2013_2.00.raw.xz
  • +
  • or.2014_1.00.raw.xz
  • +
  • pa.2012.00.raw.xz
  • +
  • pa.2013_1.00.raw.xz
  • +
  • pa.2013_2.00.raw.xz
  • +
  • pa.2014_1.00.raw.xz
  • +
  • pl.2012.00.raw.xz
  • +
  • pl.2013_1.00.raw.xz
  • +
  • pl.2013_2.00.raw.xz
  • +
  • pl.2014_1.00.raw.xz
  • +
  • ps.2012.00.raw.xz
  • +
  • ps.2013_1.00.raw.xz
  • +
  • ps.2013_2.00.raw.xz
  • +
  • ps.2014_1.00.raw.xz
  • +
  • pt.2012.00.raw.xz
  • +
  • pt.2013_1.00.raw.xz
  • +
  • pt.2013_2.00.raw.xz
  • +
  • pt.2014_1.00.raw.xz
  • +
  • qu.2012.00.raw.xz
  • +
  • qu.2013_1.00.raw.xz
  • +
  • qu.2013_2.00.raw.xz
  • +
  • qu.2014_1.00.raw.xz
  • +
  • rm.2012.00.raw.xz
  • +
  • rm.2013_1.00.raw.xz
  • +
  • rm.2013_2.00.raw.xz
  • +
  • rm.2014_1.00.raw.xz
  • +
  • rn.2012.00.raw.xz
  • +
  • rn.2013_1.00.raw.xz
  • +
  • rn.2013_2.00.raw.xz
  • +
  • rn.2014_1.00.raw.xz
  • +
  • ro.2012.00.raw.xz
  • +
  • ro.2013_1.00.raw.xz
  • +
  • ro.2013_2.00.raw.xz
  • +
  • ro.2014_1.00.raw.xz
  • +
  • ru.2012.00.raw.xz
  • +
  • ru.2013_1.00.raw.xz
  • +
  • ru.2013_2.00.raw.xz
  • +
  • ru.2014_1.00.raw.xz
  • +
  • rw.2012.00.raw.xz
  • +
  • rw.2013_1.00.raw.xz
  • +
  • rw.2013_2.00.raw.xz
  • +
  • rw.2014_1.00.raw.xz
  • +
  • sa.2012.00.raw.xz
  • +
  • sa.2013_1.00.raw.xz
  • +
  • sa.2013_2.00.raw.xz
  • +
  • sa.2014_1.00.raw.xz
  • +
  • sco.2012.00.raw.xz
  • +
  • sco.2013_1.00.raw.xz
  • +
  • sco.2013_2.00.raw.xz
  • +
  • sco.2014_1.00.raw.xz
  • +
  • sd.2012.00.raw.xz
  • +
  • sd.2013_1.00.raw.xz
  • +
  • sd.2013_2.00.raw.xz
  • +
  • sd.2014_1.00.raw.xz
  • +
  • sg.2012.00.raw.xz
  • +
  • sg.2013_1.00.raw.xz
  • +
  • sg.2013_2.00.raw.xz
  • +
  • sg.2014_1.00.raw.xz
  • +
  • si.2012.00.raw.xz
  • +
  • si.2013_1.00.raw.xz
  • +
  • si.2013_2.00.raw.xz
  • +
  • si.2014_1.00.raw.xz
  • +
  • sk.2012.00.raw.xz
  • +
  • sk.2013_1.00.raw.xz
  • +
  • sk.2013_2.00.raw.xz
  • +
  • sk.2014_1.00.raw.xz
  • +
  • sl.2012.00.raw.xz
  • +
  • sl.2013_1.00.raw.xz
  • +
  • sl.2013_2.00.raw.xz
  • +
  • sl.2014_1.00.raw.xz
  • +
  • sm.2012.00.raw.xz
  • +
  • sm.2013_1.00.raw.xz
  • +
  • sm.2013_2.00.raw.xz
  • +
  • sm.2014_1.00.raw.xz
  • +
  • sn.2012.00.raw.xz
  • +
  • sn.2013_1.00.raw.xz
  • +
  • sn.2013_2.00.raw.xz
  • +
  • sn.2014_1.00.raw.xz
  • +
  • so.2012.00.raw.xz
  • +
  • so.2013_1.00.raw.xz
  • +
  • so.2013_2.00.raw.xz
  • +
  • so.2014_1.00.raw.xz
  • +
  • sq.2012.00.raw.xz
  • +
  • sq.2013_1.00.raw.xz
  • +
  • sq.2013_2.00.raw.xz
  • +
  • sq.2014_1.00.raw.xz
  • +
  • sr.2012.00.raw.xz
  • +
  • sr.2013_1.00.raw.xz
  • +
  • sr.2013_2.00.raw.xz
  • +
  • sr.2014_1.00.raw.xz
  • +
  • sr-ME.2012.00.raw.xz
  • +
  • sr-ME.2013_1.00.raw.xz
  • +
  • sr-ME.2013_2.00.raw.xz
  • +
  • sr-ME.2014_1.00.raw.xz
  • +
  • ss.2012.00.raw.xz
  • +
  • ss.2013_1.00.raw.xz
  • +
  • ss.2013_2.00.raw.xz
  • +
  • ss.2014_1.00.raw.xz
  • +
  • st.2012.00.raw.xz
  • +
  • st.2013_1.00.raw.xz
  • +
  • st.2013_2.00.raw.xz
  • +
  • st.2014_1.00.raw.xz
  • +
  • su.2012.00.raw.xz
  • +
  • su.2013_1.00.raw.xz
  • +
  • su.2013_2.00.raw.xz
  • +
  • su.2014_1.00.raw.xz
  • +
  • sv.2012.00.raw.xz
  • +
  • sv.2013_1.00.raw.xz
  • +
  • sv.2013_2.00.raw.xz
  • +
  • sv.2014_1.00.raw.xz
  • +
  • sw.2012.00.raw.xz
  • +
  • sw.2013_1.00.raw.xz
  • +
  • sw.2013_2.00.raw.xz
  • +
  • sw.2014_1.00.raw.xz
  • +
  • syr.2012.00.raw.xz
  • +
  • syr.2013_1.00.raw.xz
  • +
  • syr.2013_2.00.raw.xz
  • +
  • syr.2014_1.00.raw.xz
  • +
  • ta.2012.00.raw.xz
  • +
  • ta.2013_1.00.raw.xz
  • +
  • ta.2013_2.00.raw.xz
  • +
  • ta.2014_1.00.raw.xz
  • +
  • te.2012.00.raw.xz
  • +
  • te.2013_1.00.raw.xz
  • +
  • te.2013_2.00.raw.xz
  • +
  • te.2014_1.00.raw.xz
  • +
  • tg.2012.00.raw.xz
  • +
  • tg.2013_1.00.raw.xz
  • +
  • tg.2013_2.00.raw.xz
  • +
  • tg.2014_1.00.raw.xz
  • +
  • th.2012.00.raw.xz
  • +
  • th.2013_1.00.raw.xz
  • +
  • th.2013_2.00.raw.xz
  • +
  • th.2014_1.00.raw.xz
  • +
  • ti.2012.00.raw.xz
  • +
  • ti.2013_1.00.raw.xz
  • +
  • ti.2013_2.00.raw.xz
  • +
  • ti.2014_1.00.raw.xz
  • +
  • tk.2012.00.raw.xz
  • +
  • tk.2013_1.00.raw.xz
  • +
  • tk.2013_2.00.raw.xz
  • +
  • tk.2014_1.00.raw.xz
  • +
  • tl.2012.00.raw.xz
  • +
  • tl.2013_1.00.raw.xz
  • +
  • tl.2013_2.00.raw.xz
  • +
  • tl.2014_1.00.raw.xz
  • +
  • tlh.2012.00.raw.xz
  • +
  • tlh.2013_1.00.raw.xz
  • +
  • tlh.2013_2.00.raw.xz
  • +
  • tlh.2014_1.00.raw.xz
  • +
  • tn.2012.00.raw.xz
  • +
  • tn.2013_1.00.raw.xz
  • +
  • tn.2013_2.00.raw.xz
  • +
  • tn.2014_1.00.raw.xz
  • +
  • to.2012.00.raw.xz
  • +
  • to.2013_1.00.raw.xz
  • +
  • to.2013_2.00.raw.xz
  • +
  • to.2014_1.00.raw.xz
  • +
  • tr.2012.00.raw.xz
  • +
  • tr.2013_1.00.raw.xz
  • +
  • tr.2013_2.00.raw.xz
  • +
  • tr.2014_1.00.raw.xz
  • +
  • ts.2012.00.raw.xz
  • +
  • ts.2013_1.00.raw.xz
  • +
  • ts.2013_2.00.raw.xz
  • +
  • ts.2014_1.00.raw.xz
  • +
  • tt.2012.00.raw.xz
  • +
  • tt.2013_1.00.raw.xz
  • +
  • tt.2013_2.00.raw.xz
  • +
  • tt.2014_1.00.raw.xz
  • +
  • ug.2012.00.raw.xz
  • +
  • ug.2013_1.00.raw.xz
  • +
  • ug.2013_2.00.raw.xz
  • +
  • ug.2014_1.00.raw.xz
  • +
  • uk.2012.00.raw.xz
  • +
  • uk.2013_1.00.raw.xz
  • +
  • uk.2013_2.00.raw.xz
  • +
  • uk.2014_1.00.raw.xz
  • +
  • un.2012.00.raw.xz
  • +
  • un.2013_1.00.raw.xz
  • +
  • ur.2012.00.raw.xz
  • +
  • ur.2013_1.00.raw.xz
  • +
  • ur.2013_2.00.raw.xz
  • +
  • ur.2014_1.00.raw.xz
  • +
  • uz.2012.00.raw.xz
  • +
  • uz.2013_1.00.raw.xz
  • +
  • uz.2013_2.00.raw.xz
  • +
  • uz.2014_1.00.raw.xz
  • +
  • ve.2012.00.raw.xz
  • +
  • ve.2013_1.00.raw.xz
  • +
  • ve.2013_2.00.raw.xz
  • +
  • ve.2014_1.00.raw.xz
  • +
  • vi.2012.00.raw.xz
  • +
  • vi.2013_1.00.raw.xz
  • +
  • vi.2013_2.00.raw.xz
  • +
  • vi.2014_1.00.raw.xz
  • +
  • vo.2012.00.raw.xz
  • +
  • vo.2013_1.00.raw.xz
  • +
  • vo.2013_2.00.raw.xz
  • +
  • vo.2014_1.00.raw.xz
  • +
  • war.2012.00.raw.xz
  • +
  • war.2013_1.00.raw.xz
  • +
  • war.2013_2.00.raw.xz
  • +
  • war.2014_1.00.raw.xz
  • +
  • wo.2012.00.raw.xz
  • +
  • wo.2013_1.00.raw.xz
  • +
  • wo.2013_2.00.raw.xz
  • +
  • wo.2014_1.00.raw.xz
  • +
  • xh.2012.00.raw.xz
  • +
  • xh.2013_1.00.raw.xz
  • +
  • xh.2013_2.00.raw.xz
  • +
  • xh.2014_1.00.raw.xz
  • +
  • xx-Armi.2012.00.raw.xz
  • +
  • xx-Armi.2013_1.00.raw.xz
  • +
  • xx-Armi.2013_2.00.raw.xz
  • +
  • xx-Armi.2014_1.00.raw.xz
  • +
  • xx-Avst.2012.00.raw.xz
  • +
  • xx-Avst.2013_1.00.raw.xz
  • +
  • xx-Avst.2013_2.00.raw.xz
  • +
  • xx-Avst.2014_1.00.raw.xz
  • +
  • xx-Bali.2012.00.raw.xz
  • +
  • xx-Bali.2013_1.00.raw.xz
  • +
  • xx-Bali.2013_2.00.raw.xz
  • +
  • xx-Bali.2014_1.00.raw.xz
  • +
  • xx-Bamu.2012.00.raw.xz
  • +
  • xx-Bamu.2013_1.00.raw.xz
  • +
  • xx-Bamu.2013_2.00.raw.xz
  • +
  • xx-Bamu.2014_1.00.raw.xz
  • +
  • xx-Batk.2012.00.raw.xz
  • +
  • xx-Batk.2013_1.00.raw.xz
  • +
  • xx-Batk.2013_2.00.raw.xz
  • +
  • xx-Batk.2014_1.00.raw.xz
  • +
  • xx-Bopo.2012.00.raw.xz
  • +
  • xx-Bopo.2013_1.00.raw.xz
  • +
  • xx-Bopo.2013_2.00.raw.xz
  • +
  • xx-Bopo.2014_1.00.raw.xz
  • +
  • xx-Brah.2012.00.raw.xz
  • +
  • xx-Brah.2013_1.00.raw.xz
  • +
  • xx-Brah.2013_2.00.raw.xz
  • +
  • xx-Brah.2014_1.00.raw.xz
  • +
  • xx-Bugi.2012.00.raw.xz
  • +
  • xx-Bugi.2013_1.00.raw.xz
  • +
  • xx-Bugi.2013_2.00.raw.xz
  • +
  • xx-Bugi.2014_1.00.raw.xz
  • +
  • xx-Buhd.2012.00.raw.xz
  • +
  • xx-Buhd.2013_1.00.raw.xz
  • +
  • xx-Buhd.2013_2.00.raw.xz
  • +
  • xx-Buhd.2014_1.00.raw.xz
  • +
  • xx-Cakm.2012.00.raw.xz
  • +
  • xx-Cakm.2013_1.00.raw.xz
  • +
  • xx-Cakm.2013_2.00.raw.xz
  • +
  • xx-Cakm.2014_1.00.raw.xz
  • +
  • xx-Cari.2012.00.raw.xz
  • +
  • xx-Cari.2013_1.00.raw.xz
  • +
  • xx-Cari.2013_2.00.raw.xz
  • +
  • xx-Cari.2014_1.00.raw.xz
  • +
  • xx-Cham.2012.00.raw.xz
  • +
  • xx-Cham.2013_1.00.raw.xz
  • +
  • xx-Cham.2013_2.00.raw.xz
  • +
  • xx-Cham.2014_1.00.raw.xz
  • +
  • xx-Copt.2012.00.raw.xz
  • +
  • xx-Copt.2013_1.00.raw.xz
  • +
  • xx-Copt.2013_2.00.raw.xz
  • +
  • xx-Copt.2014_1.00.raw.xz
  • +
  • xx-Cprt.2012.00.raw.xz
  • +
  • xx-Cprt.2013_1.00.raw.xz
  • +
  • xx-Cprt.2013_2.00.raw.xz
  • +
  • xx-Cprt.2014_1.00.raw.xz
  • +
  • xx-Dsrt.2012.00.raw.xz
  • +
  • xx-Dsrt.2013_1.00.raw.xz
  • +
  • xx-Dsrt.2013_2.00.raw.xz
  • +
  • xx-Dsrt.2014_1.00.raw.xz
  • +
  • xx-Egyp.2012.00.raw.xz
  • +
  • xx-Egyp.2013_1.00.raw.xz
  • +
  • xx-Egyp.2013_2.00.raw.xz
  • +
  • xx-Egyp.2014_1.00.raw.xz
  • +
  • xx-Glag.2012.00.raw.xz
  • +
  • xx-Glag.2013_1.00.raw.xz
  • +
  • xx-Glag.2013_2.00.raw.xz
  • +
  • xx-Glag.2014_1.00.raw.xz
  • +
  • xx-Goth.2012.00.raw.xz
  • +
  • xx-Goth.2013_1.00.raw.xz
  • +
  • xx-Goth.2013_2.00.raw.xz
  • +
  • xx-Goth.2014_1.00.raw.xz
  • +
  • xx-Hano.2012.00.raw.xz
  • +
  • xx-Hano.2013_1.00.raw.xz
  • +
  • xx-Hano.2013_2.00.raw.xz
  • +
  • xx-Hano.2014_1.00.raw.xz
  • +
  • xx-Ital.2012.00.raw.xz
  • +
  • xx-Ital.2013_1.00.raw.xz
  • +
  • xx-Ital.2013_2.00.raw.xz
  • +
  • xx-Ital.2014_1.00.raw.xz
  • +
  • xx-Java.2012.00.raw.xz
  • +
  • xx-Java.2013_1.00.raw.xz
  • +
  • xx-Java.2013_2.00.raw.xz
  • +
  • xx-Java.2014_1.00.raw.xz
  • +
  • xx-Kali.2012.00.raw.xz
  • +
  • xx-Kali.2013_1.00.raw.xz
  • +
  • xx-Kali.2013_2.00.raw.xz
  • +
  • xx-Kali.2014_1.00.raw.xz
  • +
  • xx-Khar.2012.00.raw.xz
  • +
  • xx-Khar.2013_1.00.raw.xz
  • +
  • xx-Khar.2013_2.00.raw.xz
  • +
  • xx-Khar.2014_1.00.raw.xz
  • +
  • xx-Kthi.2012.00.raw.xz
  • +
  • xx-Kthi.2013_1.00.raw.xz
  • +
  • xx-Kthi.2013_2.00.raw.xz
  • +
  • xx-Kthi.2014_1.00.raw.xz
  • +
  • xx-Lana.2012.00.raw.xz
  • +
  • xx-Lana.2013_1.00.raw.xz
  • +
  • xx-Lana.2013_2.00.raw.xz
  • +
  • xx-Lana.2014_1.00.raw.xz
  • +
  • xx-Lepc.2012.00.raw.xz
  • +
  • xx-Lepc.2013_1.00.raw.xz
  • +
  • xx-Lepc.2013_2.00.raw.xz
  • +
  • xx-Lepc.2014_1.00.raw.xz
  • +
  • xx-Linb.2012.00.raw.xz
  • +
  • xx-Linb.2013_1.00.raw.xz
  • +
  • xx-Linb.2013_2.00.raw.xz
  • +
  • xx-Linb.2014_1.00.raw.xz
  • +
  • xx-Lisu.2012.00.raw.xz
  • +
  • xx-Lisu.2013_1.00.raw.xz
  • +
  • xx-Lisu.2013_2.00.raw.xz
  • +
  • xx-Lisu.2014_1.00.raw.xz
  • +
  • xx-Lyci.2012.00.raw.xz
  • +
  • xx-Lyci.2013_1.00.raw.xz
  • +
  • xx-Lyci.2013_2.00.raw.xz
  • +
  • xx-Lyci.2014_1.00.raw.xz
  • +
  • xx-Lydi.2012.00.raw.xz
  • +
  • xx-Lydi.2013_1.00.raw.xz
  • +
  • xx-Lydi.2013_2.00.raw.xz
  • +
  • xx-Lydi.2014_1.00.raw.xz
  • +
  • xx-Mand.2012.00.raw.xz
  • +
  • xx-Mand.2013_1.00.raw.xz
  • +
  • xx-Mand.2013_2.00.raw.xz
  • +
  • xx-Mand.2014_1.00.raw.xz
  • +
  • xx-Merc.2012.00.raw.xz
  • +
  • xx-Merc.2013_1.00.raw.xz
  • +
  • xx-Merc.2013_2.00.raw.xz
  • +
  • xx-Merc.2014_1.00.raw.xz
  • +
  • xx-Mero.2012.00.raw.xz
  • +
  • xx-Mero.2013_1.00.raw.xz
  • +
  • xx-Mero.2013_2.00.raw.xz
  • +
  • xx-Mero.2014_1.00.raw.xz
  • +
  • xx-Mtei.2012.00.raw.xz
  • +
  • xx-Mtei.2013_1.00.raw.xz
  • +
  • xx-Mtei.2013_2.00.raw.xz
  • +
  • xx-Mtei.2014_1.00.raw.xz
  • +
  • xx-Nkoo.2012.00.raw.xz
  • +
  • xx-Nkoo.2013_1.00.raw.xz
  • +
  • xx-Nkoo.2013_2.00.raw.xz
  • +
  • xx-Nkoo.2014_1.00.raw.xz
  • +
  • xx-Ogam.2012.00.raw.xz
  • +
  • xx-Ogam.2013_1.00.raw.xz
  • +
  • xx-Ogam.2013_2.00.raw.xz
  • +
  • xx-Ogam.2014_1.00.raw.xz
  • +
  • xx-Olck.2012.00.raw.xz
  • +
  • xx-Olck.2013_1.00.raw.xz
  • +
  • xx-Olck.2013_2.00.raw.xz
  • +
  • xx-Olck.2014_1.00.raw.xz
  • +
  • xx-Orkh.2012.00.raw.xz
  • +
  • xx-Orkh.2013_1.00.raw.xz
  • +
  • xx-Orkh.2013_2.00.raw.xz
  • +
  • xx-Orkh.2014_1.00.raw.xz
  • +
  • xx-Osma.2012.00.raw.xz
  • +
  • xx-Osma.2013_1.00.raw.xz
  • +
  • xx-Osma.2013_2.00.raw.xz
  • +
  • xx-Osma.2014_1.00.raw.xz
  • +
  • xx-Phag.2012.00.raw.xz
  • +
  • xx-Phag.2013_1.00.raw.xz
  • +
  • xx-Phag.2013_2.00.raw.xz
  • +
  • xx-Phag.2014_1.00.raw.xz
  • +
  • xx-Phli.2012.00.raw.xz
  • +
  • xx-Phli.2013_1.00.raw.xz
  • +
  • xx-Phli.2013_2.00.raw.xz
  • +
  • xx-Phli.2014_1.00.raw.xz
  • +
  • xx-Phnx.2012.00.raw.xz
  • +
  • xx-Phnx.2013_1.00.raw.xz
  • +
  • xx-Phnx.2013_2.00.raw.xz
  • +
  • xx-Phnx.2014_1.00.raw.xz
  • +
  • xx-Plrd.2012.00.raw.xz
  • +
  • xx-Plrd.2013_1.00.raw.xz
  • +
  • xx-Plrd.2013_2.00.raw.xz
  • +
  • xx-Plrd.2014_1.00.raw.xz
  • +
  • xx-Prti.2012.00.raw.xz
  • +
  • xx-Prti.2013_1.00.raw.xz
  • +
  • xx-Prti.2013_2.00.raw.xz
  • +
  • xx-Prti.2014_1.00.raw.xz
  • +
  • xx-Qaai.2012.00.raw.xz
  • +
  • xx-Qaai.2013_1.00.raw.xz
  • +
  • xx-Qaai.2013_2.00.raw.xz
  • +
  • xx-Qaai.2014_1.00.raw.xz
  • +
  • xx-Rjng.2012.00.raw.xz
  • +
  • xx-Rjng.2013_1.00.raw.xz
  • +
  • xx-Rjng.2013_2.00.raw.xz
  • +
  • xx-Rjng.2014_1.00.raw.xz
  • +
  • xx-Runr.2012.00.raw.xz
  • +
  • xx-Runr.2013_1.00.raw.xz
  • +
  • xx-Runr.2013_2.00.raw.xz
  • +
  • xx-Runr.2014_1.00.raw.xz
  • +
  • xx-Samr.2012.00.raw.xz
  • +
  • xx-Samr.2013_1.00.raw.xz
  • +
  • xx-Samr.2013_2.00.raw.xz
  • +
  • xx-Samr.2014_1.00.raw.xz
  • +
  • xx-Sarb.2012.00.raw.xz
  • +
  • xx-Sarb.2013_1.00.raw.xz
  • +
  • xx-Sarb.2013_2.00.raw.xz
  • +
  • xx-Sarb.2014_1.00.raw.xz
  • +
  • xx-Saur.2012.00.raw.xz
  • +
  • xx-Saur.2013_1.00.raw.xz
  • +
  • xx-Saur.2013_2.00.raw.xz
  • +
  • xx-Saur.2014_1.00.raw.xz
  • +
  • xx-Shaw.2012.00.raw.xz
  • +
  • xx-Shaw.2013_1.00.raw.xz
  • +
  • xx-Shaw.2013_2.00.raw.xz
  • +
  • xx-Shaw.2014_1.00.raw.xz
  • +
  • xx-Shrd.2012.00.raw.xz
  • +
  • xx-Shrd.2013_1.00.raw.xz
  • +
  • xx-Shrd.2013_2.00.raw.xz
  • +
  • xx-Shrd.2014_1.00.raw.xz
  • +
  • xx-Sora.2012.00.raw.xz
  • +
  • xx-Sora.2013_1.00.raw.xz
  • +
  • xx-Sora.2013_2.00.raw.xz
  • +
  • xx-Sora.2014_1.00.raw.xz
  • +
  • xx-Sund.2012.00.raw.xz
  • +
  • xx-Sund.2013_1.00.raw.xz
  • +
  • xx-Sund.2013_2.00.raw.xz
  • +
  • xx-Sund.2014_1.00.raw.xz
  • +
  • xx-Sylo.2012.00.raw.xz
  • +
  • xx-Sylo.2013_1.00.raw.xz
  • +
  • xx-Sylo.2013_2.00.raw.xz
  • +
  • xx-Sylo.2014_1.00.raw.xz
  • +
  • xx-Tagb.2012.00.raw.xz
  • +
  • xx-Tagb.2013_1.00.raw.xz
  • +
  • xx-Tagb.2013_2.00.raw.xz
  • +
  • xx-Tagb.2014_1.00.raw.xz
  • +
  • xx-Takr.2012.00.raw.xz
  • +
  • xx-Takr.2013_1.00.raw.xz
  • +
  • xx-Takr.2013_2.00.raw.xz
  • +
  • xx-Takr.2014_1.00.raw.xz
  • +
  • xx-Tale.2012.00.raw.xz
  • +
  • xx-Tale.2013_1.00.raw.xz
  • +
  • xx-Tale.2013_2.00.raw.xz
  • +
  • xx-Tale.2014_1.00.raw.xz
  • +
  • xx-Talu.2012.00.raw.xz
  • +
  • xx-Talu.2013_1.00.raw.xz
  • +
  • xx-Talu.2013_2.00.raw.xz
  • +
  • xx-Talu.2014_1.00.raw.xz
  • +
  • xx-Tavt.2012.00.raw.xz
  • +
  • xx-Tavt.2013_1.00.raw.xz
  • +
  • xx-Tavt.2013_2.00.raw.xz
  • +
  • xx-Tavt.2014_1.00.raw.xz
  • +
  • xx-Tfng.2012.00.raw.xz
  • +
  • xx-Tfng.2013_1.00.raw.xz
  • +
  • xx-Tfng.2013_2.00.raw.xz
  • +
  • xx-Tfng.2014_1.00.raw.xz
  • +
  • xx-Ugar.2012.00.raw.xz
  • +
  • xx-Ugar.2013_1.00.raw.xz
  • +
  • xx-Ugar.2013_2.00.raw.xz
  • +
  • xx-Ugar.2014_1.00.raw.xz
  • +
  • xx-Vaii.2012.00.raw.xz
  • +
  • xx-Vaii.2013_1.00.raw.xz
  • +
  • xx-Vaii.2013_2.00.raw.xz
  • +
  • xx-Vaii.2014_1.00.raw.xz
  • +
  • xx-Xpeo.2012.00.raw.xz
  • +
  • xx-Xpeo.2013_1.00.raw.xz
  • +
  • xx-Xpeo.2013_2.00.raw.xz
  • +
  • xx-Xpeo.2014_1.00.raw.xz
  • +
  • xx-Xsux.2012.00.raw.xz
  • +
  • xx-Xsux.2013_1.00.raw.xz
  • +
  • xx-Xsux.2013_2.00.raw.xz
  • +
  • xx-Xsux.2014_1.00.raw.xz
  • +
  • xx-Yiii.2012.00.raw.xz
  • +
  • xx-Yiii.2013_1.00.raw.xz
  • +
  • xx-Yiii.2013_2.00.raw.xz
  • +
  • xx-Yiii.2014_1.00.raw.xz
  • +
  • yi.2012.00.raw.xz
  • +
  • yi.2013_1.00.raw.xz
  • +
  • yi.2013_2.00.raw.xz
  • +
  • yi.2014_1.00.raw.xz
  • +
  • yo.2012.00.raw.xz
  • +
  • yo.2013_1.00.raw.xz
  • +
  • yo.2013_2.00.raw.xz
  • +
  • yo.2014_1.00.raw.xz
  • +
  • za.2012.00.raw.xz
  • +
  • za.2013_1.00.raw.xz
  • +
  • za.2013_2.00.raw.xz
  • +
  • za.2014_1.00.raw.xz
  • +
  • zh.2012.00.raw.xz
  • +
  • zh.2013_1.00.raw.xz
  • +
  • zh.2013_2.00.raw.xz
  • +
  • zh.2014_1.00.raw.xz
  • +
  • zh-Hant.2012.00.raw.xz
  • +
  • zh-Hant.2013_1.00.raw.xz
  • +
  • zh-Hant.2013_2.00.raw.xz
  • +
  • zh-Hant.2014_1.00.raw.xz
  • +
  • zu.2012.00.raw.xz
  • +
  • zu.2013_1.00.raw.xz
  • +
  • zu.2013_2.00.raw.xz
  • +
  • zu.2014_1.00.raw.xz
  • +
  • zzp.2012.00.raw.xz
  • +
  • zzp.2013_1.00.raw.xz
  • +
  • zzp.2013_2.00.raw.xz
  • +
  • zzp.2014_1.00.raw.xz
  • diff --git a/s3/make_non_en_public.sh b/s3/make_non_en_public.sh new file mode 100755 index 0000000..8956688 --- /dev/null +++ b/s3/make_non_en_public.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +RAW_DIR="/fs/vali0/www/data.statmt.org/ngrams/raw" +LANGS=$(ls ${RAW_DIR}/*.xz | awk 'BEGIN {FS="/"} {print $(NF)}' | cut -d '.' -f 1 | uniq) + +for lang in $LANGS; do + s3cmd setacl --acl-public -r "s3://web-language-models/ngrams/${lang}/" +done diff --git a/s3/rename_raw.sh b/s3/rename_raw.sh new file mode 100755 index 0000000..22fd6e0 --- /dev/null +++ b/s3/rename_raw.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +LANGUAGES="$1" +DELETIONS="$2" +DELETION_ERRORS="$3" +RENAMINGS="$4" +RENAMING_ERRORS="$5" + +# TODO: Dry test run. +# TODO: Test move and delete commands. + +for LANG in $(cat $LANGUAGES); do + S3_RAW_BUCKET="s3://web-language-models/ngrams/${LANG}/raw" + + S3_2014_1="${S3_RAW_BUCKET}/${LANG}.2014_1.00.raw.xz" + s3cmd info "${S3_2014_1}" > /dev/null 2>&1 + if [[ $? -eq 0 ]]; then + #s3cmd del "${S3_2014_1}" + if [[ $? -eq 0 ]]; then + echo "${S3_2014_1}" >> ${DELETIONS} + else + echo "${S3_2014_1}" >> ${DELETION_ERRORS} + fi + else + echo "${S3_2014_1}" >> ${DELETION_ERRORS} + fi + + for ID in 2012 2013_1 2013_2; do + S3_NAME="${S3_RAW_BUCKET}/${LANG}.${ID}.00.raw.xz" + S3_NEW_NAME="${S3_RAW_BUCKET}/${LANG}.${ID}.raw.xz" + s3cmd ls "${S3_NAME}" > /dev/null 2>&1 + if [[ $? -eq 0 ]]; then + #s3cmd mv "${S3_NAME}" "${S3_NEW_NAME}" + if [[ $? 
-eq 0 ]]; then + echo "${S3_NAME} ${S3_NEW_NAME}" >> ${RENAMINGS} + else + echo "${S3_NAME}" >> ${RENAMING_ERRORS} + fi + else + echo "${S3_NAME}" >> ${RENAMING_ERRORS} + fi + done +done diff --git a/s3/s3_copy.py b/s3/s3_copy.py new file mode 100755 index 0000000..6a77979 --- /dev/null +++ b/s3/s3_copy.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +# ngrams//fs/nas/eikthyrnir0/tim/cc/deduped/all_years/de/deduped/de.deduped.xz + +import argparse +import boto3 +from boto3.s3.transfer import TransferConfig + +parser = argparse.ArgumentParser() +parser.add_argument('-chunksize', type=int, default=1000, help='size of each part in MB') +parser.add_argument('-sourcekey', help='source object location') +parser.add_argument('-targetkey', help='location to copy to') +args = parser.parse_args() + +# Convert chunksize from MB to bytes +chunksize = args.chunksize * 1000000 +transferConfig = TransferConfig(multipart_threshold=chunksize, multipart_chunksize=chunksize) + +s3 = boto3.resource('s3') +copy_source = { + 'Bucket': 'web-language-models', + 'Key': args.sourcekey +} +s3.meta.client.copy(copy_source, 'web-language-models', args.targetkey, Config=transferConfig) diff --git a/s3/upload_deduped.sh b/s3/upload_deduped.sh new file mode 100755 index 0000000..9cd64fd --- /dev/null +++ b/s3/upload_deduped.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -e + +FILE="$1" +LANG=$(basename $FILE | cut -d . -f 1) + +/home/tim/bin/s3cmd/s3cmd del "s3://web-language-models/ngrams/${LANG}/deduped/${LANG}.deduped.xz" +/home/tim/bin/s3cmd/s3cmd put --multipart-chunk-size-mb=1000 "${FILE}" "s3://web-language-models/ngrams/${LANG}/deduped/" diff --git a/s3/upload_raw.sh b/s3/upload_raw.sh new file mode 100755 index 0000000..dd2ccec --- /dev/null +++ b/s3/upload_raw.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +FILE=$(basename $1 | sed 's/2013_2/2013_48/g' | sed 's/2013_1/2013_20/g') +LANGUAGE=$(echo ${FILE} | cut -d . 
-f 1) + +BUCKET="s3://web-language-models/ngrams/${LANGUAGE}/raw/${FILE}" + +/fs/zisa0/tim/dev/s3cmd/s3cmd put -q --continue-put --multipart-chunk-size-mb=1000 $1 ${BUCKET}