Initial commit
Tim Reichelt committed Aug 10, 2017
0 parents commit 6670c94
Showing 33 changed files with 2,760 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
jhu/
46 changes: 46 additions & 0 deletions LOCATIONS.md
@@ -0,0 +1,46 @@
# Locations of data

## /fs/zisa0/commoncrawl

- 2015_27 raw non-English
- 2016_30 raw non-English
- 2017_17 experiments with extracting parallel text

## /fs/freyja0/commoncrawl

- 2015_06 raw non-English
- 2015_27 langsplit files
- 2015_30 langsplit files
- 2017_17 langsplit files

## /fs/mimir0/commoncrawl

- 2015_06 English raw
- 2015_11, 2015_14, 2015_18, 2015_22, 2015_27, 2015_32, 2015_35, 2015_40, 2015_48, 2016_50, 2017_17 all raw

## /fs/nas/tim/cc

- 2015_11, 2015_14 English raw
- deduped files for ar, cs, de, es, fr, it, pl, ru

## /fs/nas/heithrun0/commoncrawl/langsplit

- langsplit files for all crawls from 2013_20 up to 2015_48 and for 2016_50
- some scripts and files from Christian which seem to be related to the parallel corpus extraction

## /fs/vili0/buck/cc/langsplit2/raw

- non-English raw files for all 2014 crawls

## /fs/vili0/buck/cc/langsplit2 and /fs/vili0/buck/cc/langsplit

- intermediate data between the langsplit files and the raw files for the 2014 and 2015 crawls; a potential candidate for deletion

## /fs/vili0/www/data.statmt.org/ngrams

- home directory of the "data.statmt.org/ngrams" website, contains symbolic links to old raw data

## /fs/gna0/buck/cc/db

- contains RocksDB index data for all crawls from 2012 up to 2015_40, plus 2016_50; used in the parallel corpus extraction pipeline

3 changes: 3 additions & 0 deletions TODO
@@ -0,0 +1,3 @@
- Create deduped files for all minor languages that are not present yet
- Update English deduped files
- Copy the English language trie to AWS S3
10 changes: 10 additions & 0 deletions deduped/README.md
@@ -0,0 +1,10 @@
# Deduping .raw files

## Dedupe
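
For most languages the whole raw file fits into memory, so a single pass is enough. Below is a minimal sketch of such a pass, assuming the `commoncrawl_dedupe` binary from the preprocess repository (the same one `deduped_from_shard.sh` uses) and placeholder file names; the second argument is presumably the deduped file from an earlier crawl, so that lines seen before are dropped.

```bash
PREPROCESS_DIR="/fs/zisa0/tim/dev/preprocess/bin"

# de.raw.xz and previous/de.deduped.xz are placeholders.
xz -cd de.raw.xz \
  | ${PREPROCESS_DIR}/commoncrawl_dedupe previous/de.deduped.xz \
  | xz > de.deduped.xz
```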


## Shard and dedupe

If all of the raw data for one language is too big to fit into memory, we have to shard the raw data into multiple files. This is usually only necessary for English.
Before sharding we do some minor preprocessing of the raw data: we remove lines containing the document delimiter hash (df6fa1abb58549287111ba8d776733e9),
strip leading and trailing whitespace, and remove lines with invalid UTF-8.
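
The scripts in this directory implement the sharded variant with named pipes. Below is a minimal driver sketch, assuming GNU parallel and placeholder directory paths; because the shards are fifos, the writer and all 100 per-shard readers have to run at the same time.

```bash
RAW_DIR=/path/to/en/raw              # placeholder: directory with the *.raw.xz files
TMP_DIR=/path/to/tmp                 # placeholder: will hold the named pipes en.tmp0 .. en.tmp99
PREV_DIR=/path/to/previous/deduped   # placeholder: en.NN.deduped.xz files from the previous run
OUT_DIR=/path/to/out                 # placeholder: new en.NN.deduped.xz files are written here

# Writer: creates the 100 fifos, cleans the raw data and shards it into the pipes.
./shard_fifo.sh "${RAW_DIR}" "${TMP_DIR}" &

# Give shard_fifo.sh a moment to create the fifos before the readers open them (assumed safeguard).
sleep 5

# Readers: one deduper per shard, all running concurrently with the writer.
seq 0 99 | parallel -j 100 ./deduped_from_shard.sh {} "${TMP_DIR}" "${PREV_DIR}" "${OUT_DIR}"

wait
```

`compress_shard.sh` is presumably only needed for a variant where the shards are written to regular files instead of fifos.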
20 changes: 20 additions & 0 deletions deduped/compress_shard.sh
@@ -0,0 +1,20 @@
#!/bin/bash

set -e
set -o pipefail

# Usage: compress_shard.sh <shard id> <shard dir> <out dir>
# Gzip-compresses the shard file <shard dir>/en.tmp<ID> to <out dir>/en.tmp<ID>.gz.
ID="$1"
SHARD_DIR="$2"
OUT_DIR="$3"

INPUT_FILE="${SHARD_DIR}/en.tmp${ID}"
OUTPUT_FILE="${OUT_DIR}/en.tmp${ID}.gz"
DONEFILE="${OUTPUT_FILE}.done"

# Skip shards that have already been compressed.
if [[ -f "${DONEFILE}" ]]; then
    exit 0
fi

< "${INPUT_FILE}" gzip -c > "${OUTPUT_FILE}"

touch "${DONEFILE}"
30 changes: 30 additions & 0 deletions deduped/deduped_from_shard.sh
@@ -0,0 +1,30 @@
#!/bin/bash

set -e
set -o pipefail

# Usage: deduped_from_shard.sh <shard id> <shard dir> <previous deduped dir> <out dir>
# Dedupes one shard against the corresponding deduped file from a previous run.
ID="$1"                      # non-zero-padded shard id
SHARD_DIR="$2"
PREVIOUS_DEDUPED_DIR="$3"
OUT_DIR="$4"

PREPROCESS_DIR="/fs/zisa0/tim/dev/preprocess/bin"

PADDED_ID=$(printf "%02d" ${ID})
INPUT_FILE="${SHARD_DIR}/en.tmp${ID}"
OUTPUT_FILE="${OUT_DIR}/en.${PADDED_ID}.deduped.xz"
DONEFILE="${OUTPUT_FILE}.done"

PREVIOUS_DEDUPED_FILE="${PREVIOUS_DEDUPED_DIR}/en.${PADDED_ID}.deduped.xz"

# Skip shards that have already been deduped.
if [[ -f "${DONEFILE}" ]]; then
    exit 0
fi


<"${INPUT_FILE}" ${PREPROCESS_DIR}/commoncrawl_dedupe ${PREVIOUS_DEDUPED_FILE} | xz > "${OUTPUT_FILE}"

rm "${INPUT_FILE}"

touch "${DONEFILE}"
23 changes: 23 additions & 0 deletions deduped/shard_fifo.sh
@@ -0,0 +1,23 @@
#!/bin/bash

set -e
set -o pipefail

# Usage: shard_fifo.sh <raw dir> <tmp dir>
# Cleans all *.raw.xz files in <raw dir> and shards the lines into named pipes in <tmp dir>.
RAW_DIR="$1"
TMP_DIR="$2"

PREPROCESS_DIR="/fs/zisa0/tim/dev/preprocess/bin"
RAW_FILES="${RAW_DIR}/*.raw.xz"

TMP_PREFIX="en.tmp"
SHARD_COUNT=100

# Create named pipes
for i in $(seq 0 $((SHARD_COUNT-1))); do
    mkfifo "${TMP_DIR}/${TMP_PREFIX}${i}"
done

# Clean raw files and shard them into pipes
/fs/zisa0/tim/bin/xz -T10 -cd ${RAW_FILES} | \
    ${PREPROCESS_DIR}/commoncrawl_clean | \
    ${PREPROCESS_DIR}/shard_fifo ${TMP_DIR}/${TMP_PREFIX} ${SHARD_COUNT}
3 changes: 3 additions & 0 deletions download/README.md
@@ -0,0 +1,3 @@
# Download CommonCrawl data

Scripts for the monolingual pipeline described [here](https://github.com/ModernMT/DataCollection/blob/master/metadata/metadata.md). `setup.sh` creates all necessary directories and downloads the list of target URLs. `download.sh` does the actual download. `count_downloads.sh` counts how many of the files have already been downloaded.
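
A hypothetical session tying these scripts together (the crawl id, the parallelism, and the use of `download_wet.sh` as the per-file download script are assumptions; the commands are run from this directory):

```bash
# setup.sh creates 2017_17/wet, one subdirectory per crawl segment, and wet.paths.http.
./setup.sh 2017_17
cd 2017_17/wet

# Download, language-split and recompress every WET file. Each finished file leaves a
# .done marker, so the command can simply be re-run after an interruption.
parallel -j 8 ../../download_wet.sh {} < wet.paths.http

# Check progress.
../../count_downloads.sh wet.paths.http
```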
19 changes: 19 additions & 0 deletions download/count_downloads.sh
@@ -0,0 +1,19 @@
#!/bin/bash

set -e
set -o pipefail

# Usage: count_downloads.sh <paths file>
# Counts how many of the files listed in <paths file> already have a .done marker.
total=0
downloaded=0
echo "$total"; echo -en "\e[1A"
for path in `cat $1`; do
    echo -e "\e[0K\r $total"; echo -en "\e[1A"
    total=$((total+1))
    FILENAME=$(echo $path | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF)}')
    if [ -f ${FILENAME}.done ]; then
        downloaded=$((downloaded+1))
    fi
done

echo "$downloaded/$total"
echo "Downloaded/Total"
14 changes: 14 additions & 0 deletions download/download_wet.sh
@@ -0,0 +1,14 @@
#!/bin/bash

set -e
set -o pipefail

# Usage: download_wet.sh <URL of one WET file>
# Downloads the file, splits it by language and writes <segment>/<file name>.langsplit.xz.
FILENAME=$(echo $1 | awk ' BEGIN { FS = "/" } { print $(NF-2) "/" $(NF)}')

if [ ! -f ${FILENAME}.done ]; then
    curl -s $1 | gzip -cd | \
        /fs/nas/heithrun0/commoncrawl/langsplit/bin/read_wet.py | \
        /fs/nas/heithrun0/commoncrawl/langsplit/bin/langsplit --printchunks 2> /dev/null | \
        xz -9 -e -T 2 > ${FILENAME}.langsplit.xz
    touch ${FILENAME}.done
fi
21 changes: 21 additions & 0 deletions download/setup.sh
@@ -0,0 +1,21 @@
#!/bin/bash

set -e
set -o pipefail


# Usage: setup.sh <crawl id>, e.g. setup.sh 2017_17
YEAR=$(echo $1 | awk ' BEGIN { FS = "_" } { print $1 }')
WEEK=$(echo $1 | awk ' BEGIN { FS = "_" } { print $2 }')

# Make directory for specified crawl
mkdir -p ${1}/wet
cd ${1}/wet

# Download path file
wget https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-${YEAR}-${WEEK}/wet.paths.gz

# Convert to HTTPS URLs
gzip -cd wet.paths.gz | sed 's/^/https:\/\/commoncrawl.s3.amazonaws.com\//' > wet.paths.http

# Make one subdirectory per crawl segment
for f in `gzip -cd wet.paths.gz | cut -d '/' -f 4 | sort | uniq`; do mkdir -p $f; done
25 changes: 25 additions & 0 deletions raw/README.md
@@ -0,0 +1,25 @@
# Creating .raw files

## High-level description

This pipeline takes the `*.langsplit.xz` files as input. Note that each crawl from CommonCrawl is usually split into 100 different shards.
However, this number is not necessarily consistent across crawls (e.g. sometimes it might be 98). Each of those shards is in turn split into
several hundred files, and for each of these files we have one `.langsplit.xz` file.

The script `collect_monolingual.sh` takes the directory name of one shard as its first argument, reads all the `.langsplit.xz` files in that directory and splits them
by language. The second argument is the output directory. For each language, `collect_monolingual.sh` writes a file named
`text.${language}.gz` to the output directory.

Since `collect_monolingual.sh` is called on each of the 100 shards separately, we still have to concatenate all the different `text.${language}.gz` files
into one big `${language}.raw.xz` file. This is done with the `create_raw.sh` script. There is a separate `create_raw_en.sh` for English, which creates 100 raw files
because a single raw file for English would be too large.

## Running the pipeline

Run `collect_monolingual.sh` on every shard directory, using the shard directory itself as the output directory:

```bash
ls * | parallel ./collect_monolingual.sh {} {}
```

Then build one raw file per language with `create_raw.sh`, where `$crawl_dir` and `$out_dir` point to the crawl directory and the output directory (the exact argument order depends on `create_raw.sh`):

```bash
cat language.codes | parallel ./create_raw.sh $crawl_dir $out_dir {}
```
98 changes: 98 additions & 0 deletions raw/collect_langs.py
@@ -0,0 +1,98 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import argparse

magic_number = 'df6fa1abb58549287111ba8d776733e9'

cld2_langcodes = ['en', 'da', 'nl', 'fi', 'fr', 'de', 'iw', 'it',
'ja', 'ko', 'no', 'pl', 'pt', 'ru', 'es', 'sv',
'zh', 'cs', 'el', 'is', 'lv', 'lt', 'ro', 'hu',
'et', 'xxx', 'un', 'bg', 'hr', 'sr', 'ga', 'gl',
'tl', 'tr', 'uk', 'hi', 'mk', 'bn', 'id', 'la',
'ms', 'ml', 'cy', 'ne', 'te', 'sq', 'ta', 'be',
'jw', 'oc', 'ur', 'bh', 'gu', 'th', 'ar', 'ca',
'eo', 'eu', 'ia', 'kn', 'pa', 'gd', 'sw', 'sl',
'mr', 'mt', 'vi', 'fy', 'sk', 'zh-Hant', 'fo',
'su', 'uz', 'am', 'az', 'ka', 'ti', 'fa', 'bs',
'si', 'nn', 'xh', 'zu', 'gn', 'st', 'tk', 'ky',
'br', 'tw', 'yi', 'so', 'ug', 'ku', 'mn', 'hy',
'lo', 'sd', 'rm', 'af', 'lb', 'my', 'km', 'bo',
'dv', 'chr', 'syr', 'lif', 'or', 'as', 'co',
'ie', 'kk', 'ln', 'mi', 'wo', 'ab', 'aa', 'ay',
'ba', 'bi', 'dz', 'fj', 'kl', 'ha', 'ht', 'ik',
'iu', 'ks', 'rw', 'mg', 'na', 'om', 'rn', 'sm',
'sg', 'sa', 'ss', 'ts', 'tn', 'vo', 'za', 'kha',
'sco', 'lg', 'gv', 'sr-ME', 'ak', 'ig', 'mfe',
'haw', 'ceb', 'ee', 'gaa', 'blu', 'kri', 'loz',
'lua', 'luo', 'new', 'ny', 'os', 'pam', 'nso',
'raj', 'crs', 'tum', 've', 'war', 'nr', 'zzb',
'zzp', 'zzh', 'tlh', 'zze', 'xx-Zyyy', 'xx-Latn',
'xx-Grek', 'xx-Cyrl', 'xx-Armn', 'xx-Hebr',
'xx-Arab', 'xx-Syrc', 'xx-Thaa', 'xx-Deva',
'xx-Beng', 'xx-Guru', 'xx-Gujr', 'xx-Orya',
'xx-Taml', 'xx-Telu', 'xx-Knda', 'xx-Mlym',
'xx-Sinh', 'xx-Thai', 'xx-Laoo', 'xx-Tibt',
'xx-Mymr', 'xx-Geor', 'xx-Hang', 'xx-Ethi',
'xx-Cher', 'xx-Cans', 'xx-Ogam', 'xx-Runr',
'xx-Khmr', 'xx-Mong', 'xx-Hira', 'xx-Kana',
'xx-Bopo', 'xx-Hani', 'xx-Yiii', 'xx-Ital',
'xx-Goth', 'xx-Dsrt', 'xx-Qaai', 'xx-Tglg',
'xx-Hano', 'xx-Buhd', 'xx-Tagb', 'xx-Limb',
'xx-Tale', 'xx-Linb', 'xx-Ugar', 'xx-Shaw',
'xx-Osma', 'xx-Cprt', 'xx-Brai', 'xx-Bugi',
'xx-Copt', 'xx-Talu', 'xx-Glag', 'xx-Tfng',
'xx-Sylo', 'xx-Xpeo', 'xx-Khar', 'xx-Bali',
'xx-Xsux', 'xx-Phnx', 'xx-Phag', 'xx-Nkoo',
'xx-Sund', 'xx-Lepc', 'xx-Olck', 'xx-Vaii',
'xx-Saur', 'xx-Kali', 'xx-Rjng', 'xx-Lyci',
'xx-Cari', 'xx-Lydi', 'xx-Cham', 'xx-Lana',
'xx-Tavt', 'xx-Avst', 'xx-Egyp', 'xx-Samr',
'xx-Lisu', 'xx-Bamu', 'xx-Java', 'xx-Mtei',
'xx-Armi', 'xx-Sarb', 'xx-Prti', 'xx-Phli',
'xx-Orkh', 'xx-Kthi', 'xx-Batk', 'xx-Brah',
'xx-Mand', 'xx-Cakm', 'xx-Merc', 'xx-Mero',
'xx-Plrd', 'xx-Shrd', 'xx-Sora', 'xx-Takr']
cld2_langcodes = [lc.replace('-', '_') for lc in cld2_langcodes]

parser = argparse.ArgumentParser()
for lc in cld2_langcodes:
    parser.add_argument("-%s" % lc,
                        help="outfile for %s data" % lc,
                        type=argparse.FileType('wb'))
args = parser.parse_args()

lang2file = {}
for lc in cld2_langcodes:
    if getattr(args, lc) is not None:
        lang2file[lc] = getattr(args, lc)


buf = []             # lines of the document currently being read
current_lang = None  # set if the current document's language was requested

for line in sys.stdin:
    if line.startswith(magic_number):
        # A new document starts: flush the previous one to its language file.
        if buf:
            assert current_lang is not None
            lang2file[current_lang].write("".join(buf))

        current_lang = None
        buf = []

        # The header line carries key:value pairs, e.g. "language:de".
        for kv in line.strip().split():
            if kv.startswith("language:"):
                lang = kv.split(':', 1)[1]
                if lang in lang2file:
                    current_lang = lang

    # Buffer the line (including the header line) only if its language was requested.
    if current_lang:
        buf.append(line)

# Flush the last document.
if buf:
    assert current_lang is not None
    lang2file[current_lang].write("".join(buf))

# Flush and close all output files.
for _, lang_file in lang2file.iteritems():
    lang_file.flush()
    lang_file.close()
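
A hypothetical invocation (the shard directory and output file names are assumptions): the script reads concatenated langsplit data on stdin and is given one output file per language to keep; documents in all other languages are discarded.

```bash
xz -cd shard_00/*.langsplit.xz | python collect_langs.py -de de.txt -fr fr.txt
```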