Commit

Update dedupe directory
Tim Reichelt committed Aug 10, 2017
1 parent 6670c94 commit 0541a36
Showing 9 changed files with 157 additions and 1,888 deletions.
36 changes: 36 additions & 0 deletions deduped/README.md
@@ -8,3 +8,39 @@
If all of the raw data for one language is too big to fit into memory, we have to shard the raw data into multiple files. This is usually only necessary for English.
Before sharding we do some minor processing of the raw data: we remove lines containing the document delimiter hash (df6fa1abb58549287111ba8d776733e9),
strip leading and trailing whitespace, and remove lines with invalid UTF-8.
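
For illustration, this cleanup could be expressed with standard tools roughly as follows. This is a sketch, not the repository's actual implementation; the input filename and the UTF-8 locale are assumptions:
```bash
# Sketch of the pre-sharding cleanup; the real pipeline uses the
# preprocess binaries, so treat this as illustrative only.
# Assumes a UTF-8 locale, in which grep's `.` matches only valid characters.
export LC_ALL=en_US.UTF-8
xz -cd en.2017_17.raw.xz \
  | grep -vF df6fa1abb58549287111ba8d776733e9 \
  | sed 's/^[[:space:]]*//; s/[[:space:]]*$//' \
  | grep -ax '.*' \
  > en.cleaned
```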

## Running the scripts

### Deduping without sharding

Let's assume that all the raw files you want to dedupe are in `/path/to/raw` and are named `${language_code}.2017_17.raw.xz`. You want to store the new
deduped files in `/path/to/deduped`, and each language may already have a previously deduped file with the name `/path/to/${language_code}.deduped.xz`. Deduping
all languages in parallel can then be done with:
```bash
cat language.codes | parallel ./dedupe.sh /path/to/raw/{}.2017_17.raw.xz /path/to/deduped {} /path/to/{}.deduped.xz
```
Here `language.codes` is a newline-separated list of the language codes that we want to dedupe. Note that the command also works if some languages
don't already have a file at `/path/to/${language_code}.deduped.xz`.
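
For illustration, `language.codes` might look like this (which codes you include is up to you):
```
de
en
fr
```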

### Deduping with sharding

<b>NOTE:</b> By default the sharding assumes that we are working on English data and shards into 100 files. However, it should be trivial to add the language
code as an argument.

Sharding the files:
```bash
./shard_fifo.sh /path/to/raw_files /path/to/fifos
```

Then open a new shell on the same machine and run:
```bash
seq 0 99 | parallel -j100 ./compress_shard.sh {} /path/to/fifos /path/to/shards
```
Here `/path/to/shards` is the output directory. It is important that you start all 100 jobs at once and that all of them run on the same machine;
otherwise the FIFOs don't work, as the short illustration below shows.
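
A named pipe blocks until both ends are open, which is why every shard needs its reader attached. Here is a minimal self-contained demonstration (not part of the repo):
```bash
# A write to a FIFO blocks until a reader has opened the other end, which is
# why all 100 compress_shard.sh readers must be running before the writers
# can make progress.
mkfifo /tmp/demo.fifo
cat /tmp/demo.fifo &           # reader; without it the echo below blocks forever
echo "data" > /tmp/demo.fifo   # writer; completes once the reader is attached
wait                           # reap the background cat
rm /tmp/demo.fifo
```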

Now dedupe the sharded files:
```bash
seq 0 99 | parallel ./dedupe_from_shard.sh {} /path/to/shards /path/to/previous_deduped_files /path/to/outdir
```
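
For reference, `dedupe_from_shard.sh` (shown below) expects shard inputs with non-padded ids and writes outputs with zero-padded ids, e.g.:
```
/path/to/shards/en.tmp0.gz   ->  /path/to/outdir/en.00.deduped.xz
/path/to/shards/en.tmp42.gz  ->  /path/to/outdir/en.42.deduped.xz
```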

31 changes: 31 additions & 0 deletions deduped/dedupe.sh
@@ -0,0 +1,31 @@
#!/bin/bash

# Exit as soon as any command fails.
set -e
set -o pipefail

# Parallel command:
# cat language.codes | parallel ./dedupe.sh ${infile} ${outdir} {} ${previous_deduped}

DEDUPE_BIN=/fs/zisa0/tim/dev/preprocess/bin/commoncrawl_dedupe

INFILE="$1"
OUTDIR="$2"
LANGUAGE="$3"
PREVIOUS_DEDUPED="$4"

OUTFILE="${OUTDIR}/${LANGUAGE}.deduped.xz"
DONEFILE="${OUTFILE}.done"

if [[ -f "${DONEFILE}" ]]; then
exit 0
fi

# Seed the deduper with the previous crawl's deduped file if it exists.
if [[ -f "${PREVIOUS_DEDUPED}" ]]; then
xz -cd "${INFILE}" | ${DEDUPE_BIN} "${PREVIOUS_DEDUPED}" | xz -c > "${OUTFILE}"
else
xz -cd "${INFILE}" | ${DEDUPE_BIN} /dev/null | xz -c > "${OUTFILE}"
fi

touch "${DONEFILE}"

33 changes: 33 additions & 0 deletions deduped/dedupe_from_shard.sh
@@ -0,0 +1,33 @@
#!/bin/bash

set -e
set -o pipefail

# Parallel command:
# seq 0 99 | parallel ./dedupe_from_shard.sh {} ${shard_dir} ${previous_deduped_dir} ${out_dir}

DEDUPE_BIN=/fs/zisa0/tim/dev/preprocess/bin/commoncrawl_dedupe

# Non-zero-padded shard id.
ID="$1"
SHARD_DIR="$2"
PREVIOUS_DEDUPED_DIR="$3"
OUT_DIR="$4"

# The sharder from https://github.com/kpu/preprocess uses non-zero-padded ids for the shards, but the
# deduped files use zero-padded ids.
PADDED_ID=$(printf "%02d" ${ID})
INPUT_FILE="${SHARD_DIR}/en.tmp${ID}.gz"
OUTPUT_FILE="${OUT_DIR}/en.${PADDED_ID}.deduped.xz"
DONEFILE="${OUTPUT_FILE}.done"

PREVIOUS_DEDUPED_FILE="${PREVIOUS_DEDUPED_DIR}/en.${PADDED_ID}.deduped.xz"

if [[ -f "${DONEFILE}" ]]; then
exit 0
fi


gzip -cd "${INPUT_FILE}" | ${DEDUPE_BIN} "${PREVIOUS_DEDUPED_FILE}" | xz > "${OUTPUT_FILE}"

touch "${DONEFILE}"
35 changes: 35 additions & 0 deletions deduped/dedupe_hash_table.sh
@@ -0,0 +1,35 @@
#!/bin/bash

# Exit as soon as any command fails.
set -e
set -o pipefail

# Parallel command:
# cat language.codes | parallel ./dedupe_hash_table.sh ${infile} ${outdir} {} ${src_hash_table} ${out_hash_table} ${previous_deduped}

DEDUPE_BIN=/fs/zisa/tim/dev/preprocess/bin/commoncrawl_dedupe_save_table

INFILE="$1"
OUTDIR="$2"
LANGUAGE="$3"
SRC_HASH_TABLE="$4"
OUT_HASH_TABLE="$5"
PREVIOUS_DEDUPED="$6"

OUTFILE="${OUTDIR}/${LANGUAGE}.deduped.xz"
DONEFILE="${OUTFILE}.done"

if [[ -f "${DONEFILE}" ]]; then
exit 0
fi

if [[ -f "${PREVIOUS_DEDUPED}" ]]; then
xz -cd "${INFILE}" | ${DEDUPE_BIN} "${PREVIOUS_DEDUPED}" "${SRC_HASH_TABLE}" "${OUT_HASH_TABLE}" | \
/fs/zisa0/tim/bin/xz -T 6 -c > "${OUTFILE}"
else
xz -cd "${INFILE}" | ${DEDUPE_BIN} /dev/null "${SRC_HASH_TABLE}" "${OUT_HASH_TABLE}" | \
/fs/zisa0/tim/bin/xz -T 6 -c > "${OUTFILE}"
fi

touch "${DONEFILE}"

30 changes: 0 additions & 30 deletions deduped/deduped_from_shard.sh

This file was deleted.

22 changes: 22 additions & 0 deletions deduped/update_deduped_data.sh
@@ -0,0 +1,22 @@
#!/bin/bash

# Exit as soon as any command fails.
set -e
set -o pipefail

DEDUPED="$1"
NEW_DEDUPED="$2"
OFFSET_FILE="$3"
CRAWL_ID="$4"

# NOTE: Offset file format (one file for each language). Each offset is the
# byte position where that crawl's data ends in the concatenated deduped file;
# each crawl begins at the offset on the previous line (0 for the first crawl).
# {Crawl_ID} {offset1}
# {Crawl_ID} {offset2}

# Append the new deduped data to the existing deduped file.
cat "${NEW_DEDUPED}" >> "${DEDUPED}"

# Write new size to offset file.
NEW_SIZE=$(stat --printf="%s" "${DEDUPED}")
echo "${CRAWL_ID} ${NEW_SIZE}" >> "${OFFSET_FILE}"
18 changes: 0 additions & 18 deletions s3/index_files/index_lm.html

This file was deleted.
