From 78818c7795d70c93c9e89c3130c86fa43265fc07 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Wed, 26 Apr 2023 16:48:34 +0300 Subject: [PATCH 01/35] Ignore vim swap files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 933109b..b39bdf1 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ /env/src/recode-* /env/src/hunspell-* filtered-terms.txt +*.swp From e56da940a03eb3aa752aad8955f89e5596bbf87c Mon Sep 17 00:00:00 2001 From: ZJaume Date: Wed, 26 Apr 2023 18:12:02 +0300 Subject: [PATCH 02/35] Update giashard install instructions It also adds go installation and dependency of giashard --- env/setup.d/giashard | 6 +++++- env/setup.d/go | 13 +++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 env/setup.d/go diff --git a/env/setup.d/giashard b/env/setup.d/giashard index 5a3fb5d..1003ffd 100644 --- a/env/setup.d/giashard +++ b/env/setup.d/giashard @@ -4,8 +4,12 @@ is-installed() { test -x $GOPATH/bin/giashard } +depends() { + echo go +} + install() { - go get -u github.com/paracrawl/giashard/cmd/giashard + go install github.com/paracrawl/giashard/cmd/giashard@latest } diff --git a/env/setup.d/go b/env/setup.d/go new file mode 100644 index 0000000..bdfced6 --- /dev/null +++ b/env/setup.d/go @@ -0,0 +1,13 @@ +#!/bin/bash + +is-installed() { + test -x $PREFIX/go/bin/go +} + +install() { + pushd .. 
+ wget -O go.linux-amd64.tgz https://go.dev/dl/go1.20.3.linux-amd64.tar.gz + tar xvf go.linux-amd64.tgz + rm go.linux-amd64.tgz + popd +} From 10db8609058a73517df0873e21251cf5cd5a93cf Mon Sep 17 00:00:00 2001 From: ZJaume Date: Tue, 9 May 2023 17:34:51 +0300 Subject: [PATCH 03/35] LUMI config environment --- config.d/10.lumi.sh | 46 +++++++++++++++++++++++++++++++++++++++++++++ env/init.d/lumi.sh | 21 +++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 config.d/10.lumi.sh create mode 100644 env/init.d/lumi.sh diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh new file mode 100644 index 0000000..449049a --- /dev/null +++ b/config.d/10.lumi.sh @@ -0,0 +1,46 @@ +if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then + PROJ_DIR=/projappl/project_462000252/zaragoza + SCRATCH_DIR=/scratch/project_462000252/zaragoza + + function bicleaner_model { + local lang=$1 + + export BIFIXER_PARAMS="--aggressive_dedup -q" + export BICLEANER=$PREFIX/bin/bicleaner-classify-lite + export BICLEANER_THRESHOLD="0.5" + export BICLEANER_PARAMS="-q" # --score_only is always supplied + + # Default path: here instead of in config.csd3 because path depends on $lang and the exceptions + # above don't follow this pattern very well, which is why it's not in the 09.clean code itself. 
+ export BICLEANER_MODEL=$PROJ_DIR/bicleaner-models/${TARGET_LANG%~*}-${lang%~*}/${TARGET_LANG%~*}-${lang%~*}.yaml + + export BIFIXER_PARAMS="--aggressive_dedup -q" + export BICLEANER=$PREFIX/bin/bicleaner-ai-classify + export BICLEANER_THRESHOLD="0.5" + export BICLEANER_PARAMS="-q" + export BICLEANER_MODEL=$PROJ_DIR/bicleaner-ai-models/full/${TARGET_LANG%~*}-${lang%~*}/metadata.yaml + } + + export DATA_CLEANING=$SCRATCH_DIR/clean + COLLECTION_ROOT="/scratch/project_462000252/zaragoza/data" + declare -A COLLECTIONS=( + ["output_wide15_filtered_sample3"]="$COLLECTION_ROOT/output_wide15_filtered_sample3" + ["output_wide15_filtered_sample12"]="$COLLECTION_ROOT/output_wide15_filtered_sample12" + ["output_CommonCrawl40_filtered_sample"]="$COLLECTION_ROOT/output_CommonCrawl40_filtered_sample" + ) + + # Where jobs should be executed. Values used in functions.sh/schedule. + export SBATCH_ACCOUNT=project_462000252 + #TODO should investigate if this variable has to be set depending on the step + # small partition is allocatable by resources + # standard partition is allocatable by node + export SBATCH_PARTITION=small + export SLURM_LOGS=$PROJ_DIR/logs + export TASKS_PER_BATCH=${TPB:-1} + + # How many resources should be allocated per slurm job. Defaults + # to as many as necessary to process all tasks in parallel. Individual + # .slurm job definitions define how many cpus should be allocated per + # task. 
+ export SLURM_TASKS_PER_NODE=${TPN:-1} +fi diff --git a/env/init.d/lumi.sh b/env/init.d/lumi.sh new file mode 100644 index 0000000..999858f --- /dev/null +++ b/env/init.d/lumi.sh @@ -0,0 +1,21 @@ +if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then + module purge + #module purge && module load \ + # PrgEnv-cray/8.3.3 \ + # craype-x86-milan \ + # cray-python/3.9.12.1 \ + # perftools-base/22.12.0 + + # Recommended options in lumi docs: https://docs.lumi-supercomputer.eu/development/compiling/prgenv/#wrapper-and-compiler-options + # for cray compiler + # gnu compiler in lumi throws a warning when loading it, saying it is not recommended + # according to https://docs.lumi-supercomputer.eu/development/compiling/prgenv/#choosing-the-target-architecture + # instead of specifying -march, craype-x86-milan module has to be loaded for LUMI-C partitions + export CFLAGS="-O2 -funroll-loops -ffast-math" + export CXXFLAGS="-O2 -funroll-loops -ffast-math" + export IS_LUMI=true + + export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/test_env/conda_env/bin:$PATH" + + export SQUEUE_FORMAT="%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" +fi From da369d3d37d9e6204e8845e70f0a4ccb5b16278e Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 11 May 2023 13:55:05 +0300 Subject: [PATCH 04/35] Add translate-bergamot.sh script --- models/translate-bergamot.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100755 models/translate-bergamot.sh diff --git a/models/translate-bergamot.sh b/models/translate-bergamot.sh new file mode 100755 index 0000000..0b9734c --- /dev/null +++ b/models/translate-bergamot.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -euo pipefail + +MARIAN=/projappl/project_462000252/software/marian-bergamot +MODEL=$(dirname $(realpath -es ${BASH_SOURCE[0]}))/model + +$MARIAN/marian-decoder \ + -c $MODEL/config.yml \ + --cpu-threads $THREADS \ + --quiet-translation From a91e303917dad97578c7449fdd27038b896a3091 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 11 May 2023 
17:15:29 +0300 Subject: [PATCH 05/35] Override env variables for DOCALIGN and BLEUALIGN --- config.d/10.lumi.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh index 449049a..10f1c7b 100644 --- a/config.d/10.lumi.sh +++ b/config.d/10.lumi.sh @@ -2,6 +2,12 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then PROJ_DIR=/projappl/project_462000252/zaragoza SCRATCH_DIR=/scratch/project_462000252/zaragoza + # Override binaries called by env variable + # they should be available in PATH for lumi + export DOCALIGN=docalign + export DOCJOIN=docjoin + export BLEUALIGN=bleualign_cpp + function bicleaner_model { local lang=$1 From 02aeadf3d4851bdb1e99b9bd9864b932bd1ed65b Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 11 May 2023 18:08:24 +0300 Subject: [PATCH 06/35] Fix bicleaner env variables setup --- config.d/10.lumi.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh index 10f1c7b..1768ece 100644 --- a/config.d/10.lumi.sh +++ b/config.d/10.lumi.sh @@ -12,7 +12,7 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then local lang=$1 export BIFIXER_PARAMS="--aggressive_dedup -q" - export BICLEANER=$PREFIX/bin/bicleaner-classify-lite + export BICLEANER=bicleaner-classify-lite export BICLEANER_THRESHOLD="0.5" export BICLEANER_PARAMS="-q" # --score_only is always supplied @@ -20,8 +20,9 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then # above don't follow this pattern very well, which is why it's not in the 09.clean code itself. 
export BICLEANER_MODEL=$PROJ_DIR/bicleaner-models/${TARGET_LANG%~*}-${lang%~*}/${TARGET_LANG%~*}-${lang%~*}.yaml + function bicleaner_ai_model { export BIFIXER_PARAMS="--aggressive_dedup -q" - export BICLEANER=$PREFIX/bin/bicleaner-ai-classify + export BICLEANER=bicleaner-ai-classify export BICLEANER_THRESHOLD="0.5" export BICLEANER_PARAMS="-q" export BICLEANER_MODEL=$PROJ_DIR/bicleaner-ai-models/full/${TARGET_LANG%~*}-${lang%~*}/metadata.yaml From 605b1768271683111b5ecb6d3c94ba71f1a0e6b7 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Fri, 12 May 2023 11:46:40 +0300 Subject: [PATCH 07/35] Fix typo --- config.d/10.lumi.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh index 1768ece..77919fd 100644 --- a/config.d/10.lumi.sh +++ b/config.d/10.lumi.sh @@ -19,6 +19,7 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then # Default path: here instead of in config.csd3 because path depends on $lang and the exceptions # above don't follow this pattern very well, which is why it's not in the 09.clean code itself. export BICLEANER_MODEL=$PROJ_DIR/bicleaner-models/${TARGET_LANG%~*}-${lang%~*}/${TARGET_LANG%~*}-${lang%~*}.yaml + } function bicleaner_ai_model { export BIFIXER_PARAMS="--aggressive_dedup -q" From c5d150f87142660fc6a20d3f74bb781fd8d9bef6 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Fri, 12 May 2023 11:48:16 +0300 Subject: [PATCH 08/35] Stop requesting more memory on align --- 06.align.sh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/06.align.sh b/06.align.sh index 504d7ca..841b01b 100755 --- a/06.align.sh +++ b/06.align.sh @@ -45,13 +45,6 @@ declare -a OPTIONS=( -o ${SLURM_LOGS}/06.align-%A_%a.out ) -# Quick hack, should be a --option option, but functions.sh doesn't -# allow for that at the moment. Someday... -if [[ ! 
-z ${OOM_PROOF:-} ]]; then - OPTIONS+=(--mem-per-cpu 12G) - export BLEUALIGN_THREADS=4 -fi - collection=$1 shift From f3ca994b2e6af625982e778169dda3bcd74c74e1 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Fri, 12 May 2023 11:50:57 +0300 Subject: [PATCH 09/35] Increase mem-per-cpu on small partition --- config.d/10.lumi.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh index 77919fd..e21cf94 100644 --- a/config.d/10.lumi.sh +++ b/config.d/10.lumi.sh @@ -43,6 +43,7 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then # small partition is allocatable by resources # standard partition is allocatable by node export SBATCH_PARTITION=small + export SBATCH_MEM_PER_CPU=1750 # Maximum recommended size for LUMI export SLURM_LOGS=$PROJ_DIR/logs export TASKS_PER_BATCH=${TPB:-1} From 1477b174a95d295069e9e3220425c031cc22790a Mon Sep 17 00:00:00 2001 From: ZJaume Date: Mon, 15 May 2023 17:25:06 +0300 Subject: [PATCH 10/35] Write directly to $SCRATCH instead of node temp in align and split --- 03.split-text | 8 ++------ 05.tokenise | 9 +++------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/03.split-text b/03.split-text index b0f3717..44f17b8 100755 --- a/03.split-text +++ b/03.split-text @@ -12,19 +12,15 @@ echo "Processing (${SLANG}) ${BATCH}" < ${BATCH}/plain_text.gz gzip -dc \ | $SPLIT -k -q -n -d -l $SLANG -c 524288 \ | gzip -9c \ -> ${TMPDIR}/sentences.$$.gz +> ${BATCH}/sentences.$$.gz echo "Testing output" docs_pt=$(gzip -cd ${BATCH}/plain_text.gz | wc -l) -docs_st=$(gzip -cd ${TMPDIR}/sentences.$$.gz | wc -l) +docs_st=$(gzip -cd ${BATCH}/sentences.$$.gz | wc -l) echo "Expecting $docs_pt documents, found $docs_st" test $docs_pt -eq $docs_st || exit 1 -# Move in two steps. First copies it to the shared fs which -# might fail because it hits a quota. Second marks it as -# the real thing. 
-mv ${TMPDIR}/sentences.$$.gz ${BATCH}/sentences.$$.gz mv ${BATCH}/sentences.$$.gz ${BATCH}/sentences.gz echo "Copied result (${SLANG}) ${BATCH}" diff --git a/05.tokenise b/05.tokenise index 0c20a33..d870372 100755 --- a/05.tokenise +++ b/05.tokenise @@ -25,23 +25,20 @@ echo "Processing (${SLANG}) ${BATCH}" < ${BATCH}/${INPUT}.gz gzip -dc \ | b64filter cache bash -c tokenise \ | gzip -9c \ -> ${TMPDIR}/${OUTPUT}.$TMPSFX.gz +> ${BATCH}/${OUTPUT}.$TMPSFX.gz echo "Checking output" docs_st=$(gzip -cd ${BATCH}/${INPUT}.gz | wc -l) -docs_tk=$(gzip -cd ${TMPDIR}/${OUTPUT}.$TMPSFX.gz | wc -l) +docs_tk=$(gzip -cd ${BATCH}/${OUTPUT}.$TMPSFX.gz | wc -l) echo "Expecting $docs_st documents, found $docs_tk" test $docs_st -eq $docs_tk || exit 1 lines_st=$(gzip -cd ${BATCH}/${INPUT}.gz | base64 -d | wc -l) -lines_tk=$(gzip -cd ${TMPDIR}/${OUTPUT}.$TMPSFX.gz | base64 -d | wc -l) +lines_tk=$(gzip -cd ${BATCH}/${OUTPUT}.$TMPSFX.gz | base64 -d | wc -l) echo "Expecting $lines_st lines, found $lines_tk" test $lines_st -eq $lines_tk || exit 1 -# Two-step move because the first one might fail and leave an -# incomplete file behind, which is tricky to detect. -mv ${TMPDIR}/${OUTPUT}.$TMPSFX.gz ${BATCH}/${OUTPUT}.$TMPSFX.gz mv ${BATCH}/${OUTPUT}.$TMPSFX.gz ${BATCH}/${OUTPUT}.gz echo "Moved result (${SLANG}) ${BATCH}/${OUTPUT}.gz" From 92f0d385cc6c272a010bb68feb1814b5c4430b2a Mon Sep 17 00:00:00 2001 From: ZJaume Date: Mon, 15 May 2023 17:27:35 +0300 Subject: [PATCH 11/35] Cut out docalign scores before docjoin This fixes alignment process as docjoin is not expecting that input. 
--- 06.align | 2 ++ 1 file changed, 2 insertions(+) diff --git a/06.align b/06.align index a962dc8..cdcde24 100755 --- a/06.align +++ b/06.align @@ -19,6 +19,7 @@ ${DOCALIGN} -j ${DOCALIGN_THREADS:-$THREADS} --threshold 0.1 \ ${SRC_BATCH}/tokenised_${TARGET_LANG%~*}.gz \ ${REF_BATCH}/tokenised_${TARGET_LANG%~*}.gz \ | tee ${SRC_BATCH}/pairs-${TARGET_LANG%~*}-${REF_BATCH_ID}.txt \ +| cut -f2- \ | ${DOCJOIN} \ -li\ -ri\ @@ -36,5 +37,6 @@ parallel \ ${BLEUALIGN} --print-sent-hash --bleu-threshold 0.2 \ | gzip -c \ > ${SRC_BATCH}/aligned-${REF_BATCH_ID}.gz.$TMPSFX + mv ${SRC_BATCH}/aligned-${REF_BATCH_ID}.gz{.$TMPSFX,} From 96f2ccf5fb5c315c982809622853cca336b84818 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Mon, 15 May 2023 20:17:50 +0300 Subject: [PATCH 12/35] Docjoin does not expect docalign score in Bitextor 8.1 --- 07.fix | 1 - 1 file changed, 1 deletion(-) diff --git a/07.fix b/07.fix index f5215a5..89f7be6 100755 --- a/07.fix +++ b/07.fix @@ -30,7 +30,6 @@ for match in $batch/aligned-+([0-9]).gz; do echo $match 1>&2 matched_batch=$(echo $match | sed 's/.*-\([0-9]*\)\.gz/\1/') paste <(gzip -cd ${match} \ - | awk -F '\t' '{ print 0.0 "\t" $1 "\t" $2}' `# bitextor's docjoin expects a score column, which it then ignores` \ | docjoin \ -r ${target_lang_data}/${shard}/${matched_batch}/url.gz \ -l $(dirname ${match})/url.gz) `# 1,2: target & source url`\ From 9b23df16b294594fd36f2ab80780ebd942f35eb6 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Mon, 15 May 2023 20:19:13 +0300 Subject: [PATCH 13/35] Ignore the model directory of individual language pairs --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b39bdf1..e0cfafc 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ /env/src/hunspell-* filtered-terms.txt *.swp +models/??-?? 
From 93f95f31d7beeafcb3c4483878c0b25dca909b6e Mon Sep 17 00:00:00 2001 From: ZJaume Date: Tue, 16 May 2023 16:17:26 +0300 Subject: [PATCH 14/35] Remove pairs.txt debugging output in 06.align --- 06.align | 1 - 1 file changed, 1 deletion(-) diff --git a/06.align b/06.align index cdcde24..592621b 100755 --- a/06.align +++ b/06.align @@ -18,7 +18,6 @@ TMPSFX=${JOB_ID:-$$} ${DOCALIGN} -j ${DOCALIGN_THREADS:-$THREADS} --threshold 0.1 \ ${SRC_BATCH}/tokenised_${TARGET_LANG%~*}.gz \ ${REF_BATCH}/tokenised_${TARGET_LANG%~*}.gz \ -| tee ${SRC_BATCH}/pairs-${TARGET_LANG%~*}-${REF_BATCH_ID}.txt \ | cut -f2- \ | ${DOCJOIN} \ -li\ From 2cc6047fef67ed0e59d9c435212d45d8ab718d32 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Fri, 19 May 2023 11:32:12 +0300 Subject: [PATCH 15/35] Set path to stable bitextor env --- env/init.d/lumi.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/env/init.d/lumi.sh b/env/init.d/lumi.sh index 999858f..e012ed8 100644 --- a/env/init.d/lumi.sh +++ b/env/init.d/lumi.sh @@ -15,7 +15,8 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then export CXXFLAGS="-O2 -funroll-loops -ffast-math" export IS_LUMI=true - export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/test_env/conda_env/bin:$PATH" + #export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/test_env/conda_env/bin:$PATH" + export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/bitextor-8.1/bin:$PATH" export SQUEUE_FORMAT="%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" fi From d1cbfc1435dd732cc9aabeedba21a0c42f09a618 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Fri, 19 May 2023 11:32:52 +0300 Subject: [PATCH 16/35] Set HF cache for bicleaner-ai --- config.d/10.lumi.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh index e21cf94..43d9710 100644 --- a/config.d/10.lumi.sh +++ b/config.d/10.lumi.sh @@ -22,11 +22,13 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then } function bicleaner_ai_model { + export 
HUGGINGFACE_HUB_CACHE="/projappl/project_462000252/.cache/huggingface/hub" export BIFIXER_PARAMS="--aggressive_dedup -q" export BICLEANER=bicleaner-ai-classify export BICLEANER_THRESHOLD="0.5" export BICLEANER_PARAMS="-q" - export BICLEANER_MODEL=$PROJ_DIR/bicleaner-ai-models/full/${TARGET_LANG%~*}-${lang%~*}/metadata.yaml + export BICLEANER_MODEL=bitextor/bicleaner-ai-full-${TARGET_LANG%~*}-${lang%~*} + #export BICLEANER_MODEL=$PROJ_DIR/bicleaner-ai-models/full/${TARGET_LANG%~*}-${lang%~*}/metadata.yaml } export DATA_CLEANING=$SCRATCH_DIR/clean From 4cfef0b67afbb690ae508dbb290c01f88e565523 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Fri, 19 May 2023 16:05:27 +0300 Subject: [PATCH 17/35] Add install script for paracrawl cpp code --- env/setup.d/paracrawl | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 env/setup.d/paracrawl diff --git a/env/setup.d/paracrawl b/env/setup.d/paracrawl new file mode 100644 index 0000000..1d609fe --- /dev/null +++ b/env/setup.d/paracrawl @@ -0,0 +1,32 @@ +#!/bin/bash + +is-installed() { + test -x $PREFIX/bin/batch_dedupe +} + +depends() { + if [ "$IS_LUMI" == "true" ]; then + echo env + else + echo boost preprocess + fi +} + +install() { + pushd paracrawl + + mkdir -p build && cd build + + if [ "$IS_LUMI" == "true" ]; then + module load Boost/1.81.0-cpeCray-23.03 ICU + fi + + cmake .. \ + -DCMAKE_BUILD_TYPE=Release + make -j8 + cp lib/libparacrawl_util.a $PREFIX/lib/ + cp bin/* $PREFIX/bin/ + cd .. 
&& rm -r build + + popd +} From 78bff5c6387d6e4709059b7761f2696fd3a83d7d Mon Sep 17 00:00:00 2001 From: ZJaume Date: Fri, 19 May 2023 16:17:16 +0300 Subject: [PATCH 18/35] Update bifixer and kenlm installations --- env/setup.d/bifixer | 5 +---- env/setup.d/kenlm | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/env/setup.d/bifixer b/env/setup.d/bifixer index 2ce9488..2e5d52f 100644 --- a/env/setup.d/bifixer +++ b/env/setup.d/bifixer @@ -9,8 +9,5 @@ depends() { } install() { - pip3 install -r bifixer/requirements.txt - echo "python3 $PREFIX/src/bifixer/bifixer/bifixer.py \"\$@\"" \ - > $PREFIX/bin/bifixer - chmod +x $PREFIX/bin/bifixer + pip install "bifixer==0.8.8" } diff --git a/env/setup.d/kenlm b/env/setup.d/kenlm index a791d86..c5c356d 100644 --- a/env/setup.d/kenlm +++ b/env/setup.d/kenlm @@ -17,6 +17,6 @@ install() { cmake .. -DKENLM_MAX_ORDER=7 -DCMAKE_INSTALL_PREFIX:PATH=$PREFIX make -j8 install - pip3 install .. --install-option="--max_order 7" + pip3 install .. 
--config-settings="--build-option=--max_order=7" popd } From e6c3d57b1bd4cdce4b31230fc4ab4957a367e293 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Fri, 19 May 2023 16:24:08 +0300 Subject: [PATCH 19/35] Override binaries locations in init not setup --- config.d/10.lumi.sh | 8 +------- env/init.d/lumi.sh | 6 ++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh index 43d9710..0d3daa7 100644 --- a/config.d/10.lumi.sh +++ b/config.d/10.lumi.sh @@ -2,12 +2,6 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then PROJ_DIR=/projappl/project_462000252/zaragoza SCRATCH_DIR=/scratch/project_462000252/zaragoza - # Override binaries called by env variable - # they should be available in PATH for lumi - export DOCALIGN=docalign - export DOCJOIN=docjoin - export BLEUALIGN=bleualign_cpp - function bicleaner_model { local lang=$1 @@ -32,7 +26,7 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then } export DATA_CLEANING=$SCRATCH_DIR/clean - COLLECTION_ROOT="/scratch/project_462000252/zaragoza/data" + COLLECTION_ROOT="$SCRATCH_DIR/data" declare -A COLLECTIONS=( ["output_wide15_filtered_sample3"]="$COLLECTION_ROOT/output_wide15_filtered_sample3" ["output_wide15_filtered_sample12"]="$COLLECTION_ROOT/output_wide15_filtered_sample12" diff --git a/env/init.d/lumi.sh b/env/init.d/lumi.sh index e012ed8..f389bf2 100644 --- a/env/init.d/lumi.sh +++ b/env/init.d/lumi.sh @@ -18,5 +18,11 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then #export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/test_env/conda_env/bin:$PATH" export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/bitextor-8.1/bin:$PATH" + # Override binaries called by env variable + # they should be available in PATH for lumi + export DOCALIGN=docalign + export DOCJOIN=docjoin + export BLEUALIGN=bleualign_cpp + export SQUEUE_FORMAT="%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" fi From 2c489e33e1c297e1c5533de4a00593b2702b43fe Mon Sep 17 00:00:00 2001 From: ZJaume 
Date: Fri, 19 May 2023 18:12:09 +0300 Subject: [PATCH 20/35] Use foldfilter in translate-bergamot --- models/translate-bergamot.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/models/translate-bergamot.sh b/models/translate-bergamot.sh index 0b9734c..54b024d 100755 --- a/models/translate-bergamot.sh +++ b/models/translate-bergamot.sh @@ -4,6 +4,7 @@ set -euo pipefail MARIAN=/projappl/project_462000252/software/marian-bergamot MODEL=$(dirname $(realpath -es ${BASH_SOURCE[0]}))/model +foldfilter -s -w 500 \ $MARIAN/marian-decoder \ -c $MODEL/config.yml \ --cpu-threads $THREADS \ From c5046eba06116942b0b2c6ac23c082d7b82543d3 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Tue, 6 Jun 2023 16:05:19 +0300 Subject: [PATCH 21/35] Undo executables env variables overridden in init instead of config --- config.d/10.lumi.sh | 7 +++++++ env/init.d/lumi.sh | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh index 0d3daa7..cd4e7f9 100644 --- a/config.d/10.lumi.sh +++ b/config.d/10.lumi.sh @@ -2,6 +2,12 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then PROJ_DIR=/projappl/project_462000252/zaragoza SCRATCH_DIR=/scratch/project_462000252/zaragoza + # Override binaries called by env variable + # they should be available in PATH for lumi + export DOCALIGN=docalign + export DOCJOIN=docjoin + export BLEUALIGN=bleualign_cpp + function bicleaner_model { local lang=$1 @@ -31,6 +37,7 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then ["output_wide15_filtered_sample3"]="$COLLECTION_ROOT/output_wide15_filtered_sample3" ["output_wide15_filtered_sample12"]="$COLLECTION_ROOT/output_wide15_filtered_sample12" ["output_CommonCrawl40_filtered_sample"]="$COLLECTION_ROOT/output_CommonCrawl40_filtered_sample" + ["wide16"]="/scratch/project_465000498/hplt/data/wide00016" ) # Where jobs should be executed. Values used in functions.sh/schedule. 
diff --git a/env/init.d/lumi.sh b/env/init.d/lumi.sh index f389bf2..e012ed8 100644 --- a/env/init.d/lumi.sh +++ b/env/init.d/lumi.sh @@ -18,11 +18,5 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then #export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/test_env/conda_env/bin:$PATH" export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/bitextor-8.1/bin:$PATH" - # Override binaries called by env variable - # they should be available in PATH for lumi - export DOCALIGN=docalign - export DOCJOIN=docjoin - export BLEUALIGN=bleualign_cpp - export SQUEUE_FORMAT="%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" fi From de9dbfaa985984e4901e43b397cf04022d025c53 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 6 Jul 2023 16:16:57 +0300 Subject: [PATCH 22/35] Fix scoring step --- 08.score | 6 +++++- 08.score.sh | 15 +++++++++++---- config.d/10.lumi.sh | 6 +++--- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/08.score b/08.score index bdc3bea..b29c306 100755 --- a/08.score +++ b/08.score @@ -2,6 +2,11 @@ set -euo pipefail shopt -s extglob +if [ "$IS_LUMI" = true ]; then + module load CrayEnv + module load rocm/5.2.3 +fi + collection=$1 lang=$2 target_lang_data=$3 @@ -22,7 +27,6 @@ test -r $HARDRULED paste <(zcat $FIXED) <(zcat $HARDRULED) \ | cache -k 3,4 ./score-wrap.py $BICLEANER $BICLEANER_PARAMS \ --score_only \ - --processes $THREADS \ --tmp_dir $TMPDIR \ --disable_hardrules \ --disable_porn_removal \ diff --git a/08.score.sh b/08.score.sh index fa86b96..2c6138a 100755 --- a/08.score.sh +++ b/08.score.sh @@ -10,10 +10,17 @@ set -euo pipefail collection=$1 shift -export SBATCH_ACCOUNT=t2-cs119-gpu -export SBATCH_PARTITION=pascal -export SLURM_TASKS_PER_NODE=1 # No parallelism in generic.slurm plz, they'll have to share the gpu otherwise. -export SBATCH_GRES=gpu:1 +if [ "$IS_LUMI" = true ]; then + export SBATCH_PARTITION="small-g" + export SLURM_TASKS_PER_NODE=1 # No parallelism in generic.slurm plz, they'll have to share the gpu otherwise. 
+ export SBATCH_GPUS_PER_TASK=1 + unset SBATCH_MEM_PER_CPU # If we are setting this for small partition, we don't need it for gpu jobs +else + export SBATCH_ACCOUNT=t2-cs119-gpu + export SBATCH_PARTITION=pascal + export SLURM_TASKS_PER_NODE=1 # No parallelism in generic.slurm plz, they'll have to share the gpu otherwise. + export SBATCH_GRES=gpu:1 +fi for lang in $*; do bicleaner_ai_model $lang diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh index cd4e7f9..0711eb8 100644 --- a/config.d/10.lumi.sh +++ b/config.d/10.lumi.sh @@ -26,12 +26,12 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then export BIFIXER_PARAMS="--aggressive_dedup -q" export BICLEANER=bicleaner-ai-classify export BICLEANER_THRESHOLD="0.5" - export BICLEANER_PARAMS="-q" + export BICLEANER_PARAMS="-q --batch_size 64 --block_size 100000" export BICLEANER_MODEL=bitextor/bicleaner-ai-full-${TARGET_LANG%~*}-${lang%~*} - #export BICLEANER_MODEL=$PROJ_DIR/bicleaner-ai-models/full/${TARGET_LANG%~*}-${lang%~*}/metadata.yaml + export HUGGINGFACE_HUB_CACHE="/projappl/project_462000252/.cache/huggingface/hub" } - export DATA_CLEANING=$SCRATCH_DIR/clean + export DATA_CLEANING=$SCRATCH_DIR/data/clean COLLECTION_ROOT="$SCRATCH_DIR/data" declare -A COLLECTIONS=( ["output_wide15_filtered_sample3"]="$COLLECTION_ROOT/output_wide15_filtered_sample3" From 65a709481729a67e37484b095ba365b39d5c5f14 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 6 Jul 2023 16:17:53 +0300 Subject: [PATCH 23/35] Fix 09.clean step Copy the source code of bitextor-elrc-filtering to cirrus-scripts --- 09.clean | 2 +- bitextor-elrc-filtering.py | 78 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 bitextor-elrc-filtering.py diff --git a/09.clean b/09.clean index b9ec692..0f18d98 100755 --- a/09.clean +++ b/09.clean @@ -42,7 +42,7 @@ paste <(pigz -dc $FIXED) <(pigz -dc $SCORED) `# add bicleaner score as the 9th c >(pigz -9c > $CLASSIFIED.$TMPSFX) \ >(wc -wl | sed 's/^ \+//' | tr -s 
' ' '\t' > $STATS.$TMPSFX) \ | awk -F"\t" "\$9 >= ${BICLEANER_THRESHOLD}" \ -| python3 $BITEXTOR/bitextor-elrc-filtering.py -c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection" -s \ +| python3 bitextor-elrc-filtering.py -c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection" -s \ | LC_ALL=C sort -t$'\t' -k7,7 -k8,8nr \ | pigz -9c \ > $FILTERED.$TMPSFX \ diff --git a/bitextor-elrc-filtering.py b/bitextor-elrc-filtering.py new file mode 100644 index 0000000..38de231 --- /dev/null +++ b/bitextor-elrc-filtering.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 + +# This file is part of Bitextor. +# +# Bitextor is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Bitextor is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Bitextor. If not, see . 
+ +import sys +import argparse + +oparser = argparse.ArgumentParser( + description="Script that reads takes a list of aligned segments, such as that produced by bitextor-alignsegments " + "script, and computes the basic ELRC quality metrics: number of tokens in lang1/lang2 and length " + "ratio.") +oparser.add_argument('aligned_seg', metavar='FILE', nargs='?', + help='File containing the set of aliged segments (if undefined, the script reads from the ' + 'standard input)', + default=None) +oparser.add_argument("-s", "--stats", help="Print stats or just output the input", action="store_true", + dest="isPrintingStats", default=False) +oparser.add_argument("-f", "--filtering", help="Filter lines according to ELRC rules (printing stats required)", + action="store_true", dest="isFiltering", default=False) +oparser.add_argument("-c", "--columns", + help="Name of columns of the input tab separated file split by comma. Default: url1,url2,seg1," + "seg2,hunalign,bicleaner", + default="url1,url2,seg1,seg2,hunalign,bicleaner") + +options = oparser.parse_args() + +if options.aligned_seg is not None: + reader = open(options.aligned_seg, "r") +else: + reader = sys.stdin + +columns = options.columns.split(',') + +for i in reader: + fields = i.split("\t") + fields[-1] = fields[-1].strip() + fieldsdict = dict() + extracolumns = [] + + for field, column in zip(fields, columns): + fieldsdict[column] = field + if options.isPrintingStats: + extracolumns = ["lengthratio", "numTokensSL", "numTokensTL"] + if len(fieldsdict["seg2"]) == 0: + lengthRatio = 0 + else: + lengthRatio = len(fieldsdict["seg1"]) * 1.0 / len(fieldsdict["seg2"]) + numTokensSL = len(fieldsdict["seg1"].split( + ' ')) # This is not the way this should be counted, we need to tokenize better first + numTokensTL = len(fieldsdict["seg2"].split( + ' ')) # This is not the way this should be counted, we need to tokenize better first + fieldsdict["lengthratio"] = str(lengthRatio) + fieldsdict["numTokensSL"] = str(numTokensSL) + 
fieldsdict["numTokensTL"] = str(numTokensTL) + if options.isFiltering: + if "bicleaner" in fieldsdict and fieldsdict["bicleaner"].strip() != '': + fieldsdict["bicleaner"] = str(round(float(fieldsdict["bicleaner"]), 4)) + if int(fieldsdict["numTokensSL"]) >= 200 or int(fieldsdict["numTokensTL"]) >= 200 or fieldsdict[ + "seg1"].strip() == '' or fieldsdict["seg2"].strip() == '' or float( + fieldsdict["lengthratio"]) >= 6 or float(fieldsdict["lengthratio"]) <= 0.1666: + continue + fieldstoprint = [] + for column in columns + extracolumns: + fieldstoprint.append(fieldsdict[column]) + print("\t".join(fieldstoprint)) From b6654113065d36bc1624ef4a40f040f05ea44e99 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 6 Jul 2023 16:19:45 +0300 Subject: [PATCH 24/35] Create cleaning directory if it doesn't exist --- 10.reduce-classified.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/10.reduce-classified.sh b/10.reduce-classified.sh index e588fcd..f81f613 100755 --- a/10.reduce-classified.sh +++ b/10.reduce-classified.sh @@ -20,6 +20,7 @@ for collection in $collections; do batch_lists+=( $batch_list ) done +mkdir -p $DATA_CLEANING output_file="${DATA_CLEANING}/${TARGET_LANG}-${lang}/${TARGET_LANG%~*}-${lang%~*}.${collection_hash}.classified.gz" if [ ! -f $output_file ] || ! $RETRY; then From 04e918f9f3330b75528e87826081ba1df8aa8bc5 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 6 Jul 2023 16:22:21 +0300 Subject: [PATCH 25/35] Fix 11.reduce filtered step Use cleaning dir as temp. 
Load boost on init.d for merge_sort to work --- 11.reduce-filtered | 4 ++-- env/init.d/lumi.sh | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/11.reduce-filtered b/11.reduce-filtered index 8caa69d..9086e3b 100755 --- a/11.reduce-filtered +++ b/11.reduce-filtered @@ -7,8 +7,8 @@ shift # Set up temp directory TMPSFX=${JOB_ID:-$$} -#TMPDIR=$(mktemp -d --tmpdir=$(dirname $output_file) --suffix=_$TMPSFX) -TMPDIR=$(mktemp -d --tmpdir=$SCRATCH --suffix=_$TMPSFX) +TMPDIR=$(mktemp -d --tmpdir=$(dirname $output_file) --suffix=_$TMPSFX) +#TMPDIR=$(mktemp -d --tmpdir=$SCRATCH --suffix=_$TMPSFX) test -d "$TMPDIR" trap "rm -rf $TMPDIR" EXIT diff --git a/env/init.d/lumi.sh b/env/init.d/lumi.sh index e012ed8..587cecf 100644 --- a/env/init.d/lumi.sh +++ b/env/init.d/lumi.sh @@ -1,5 +1,6 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then module purge + module load Boost/1.81.0-cpeCray-23.03 # This must be changed if boost version changes in env/setup.d/paracrawl #module purge && module load \ # PrgEnv-cray/8.3.3 \ # craype-x86-milan \ From 7bcdbd9a034f06740e0bbef488b1fb6640c9aa75 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 6 Jul 2023 16:23:22 +0300 Subject: [PATCH 26/35] Fix 12.reduce.tmx Use bitextor-buildtmx in cirrus-scripts. Fix bitextor import of utils. 
--- 12.reduce-tmx | 2 +- 12.reduce-tmx-deferred | 2 +- bitextor-buildTMX.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/12.reduce-tmx b/12.reduce-tmx index 659617f..cc07dcc 100755 --- a/12.reduce-tmx +++ b/12.reduce-tmx @@ -9,7 +9,7 @@ shift 3 filtered_input=$@ pigz -cd $filtered_input \ -| PYTHONPATH=$PREFIX/src/bitextor python3 ${SCRIPTS}/bitextor-buildTMX.py \ +| python3 bitextor-buildTMX.py \ --lang1 ${TARGET_LANG%~*} --lang2 ${lang} \ -c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection,lengthratio,numTokensSL,numTokensTL" \ --no-delete-seg \ diff --git a/12.reduce-tmx-deferred b/12.reduce-tmx-deferred index 284600f..d8baca5 100755 --- a/12.reduce-tmx-deferred +++ b/12.reduce-tmx-deferred @@ -8,7 +8,7 @@ shift 2 filtered_input=$@ pigz -cd $filtered_input \ -| PYTHONPATH=$PREFIX/src/bitextor python3 ${SCRIPTS}/bitextor-buildTMX.py \ +| python3 bitextor-buildTMX.py \ --lang1 ${TARGET_LANG%~*} --lang2 ${lang} \ -c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection,lengthratio,numTokensSL,numTokensTL" \ --dedup "bifixerhash" \ diff --git a/bitextor-buildTMX.py b/bitextor-buildTMX.py index d00d080..b85f408 100644 --- a/bitextor-buildTMX.py +++ b/bitextor-buildTMX.py @@ -39,8 +39,7 @@ import unicodedata from xml.sax.saxutils import escape -sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/utils") -from utils.common import open_xz_or_gzip_or_plain, dummy_open +from bitextor.utils.common import open_xz_or_gzip_or_plain, dummy_open def remove_control_characters(text): return "".join(ch for ch in text if unicodedata.category(ch)[0]!="C") From aa6962847efe09ad6e86fc58ded6816f58205ff1 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 18 Jul 2023 13:57:53 +0300 Subject: [PATCH 27/35] 07.fix read model data from huggingface cache --- 07.fix | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/07.fix b/07.fix index 89f7be6..d405f9b 
100755 --- a/07.fix +++ b/07.fix @@ -26,6 +26,10 @@ remove_empty_lines() { awk -F"\t" '$3 != "" && $4 != "" { print }' } +# Fix bicleaner model path for non-huggingface tools (aka bicleaner-hardrules) +BICLEANER_MODEL_GIT_DIR=$HUGGINGFACE_HUB_CACHE/models--${BICLEANER_MODEL//\//--} +BICLEANER_MODEL=${BICLEANER_MODEL_GIT_DIR}/snapshots/$(cat $BICLEANER_MODEL_GIT_DIR/refs/main) + for match in $batch/aligned-+([0-9]).gz; do echo $match 1>&2 matched_batch=$(echo $match | sed 's/.*-\([0-9]*\)\.gz/\1/') @@ -52,7 +56,7 @@ done \ --target_lang $bicleaner_lang \ --scol 3 \ --tcol 4 \ - --metadata $BICLEANER_MODEL \ + --metadata $BICLEANER_MODEL/metadata.yaml \ /dev/stdin /dev/stdout \ | pigz -9c \ >$HARDRULED.$TMPSFX From 036e160469d7df112543f67ff3e64243f80e2a34 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 18 Jul 2023 13:58:54 +0300 Subject: [PATCH 28/35] Remove cache from tokenize pipeline --- 05.tokenise | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/05.tokenise b/05.tokenise index d870372..bf459bc 100755 --- a/05.tokenise +++ b/05.tokenise @@ -23,7 +23,7 @@ export -f tokenise echo "Processing (${SLANG}) ${BATCH}" < ${BATCH}/${INPUT}.gz gzip -dc \ -| b64filter cache bash -c tokenise \ +| b64filter bash -c tokenise \ | gzip -9c \ > ${BATCH}/${OUTPUT}.$TMPSFX.gz From 9fbe192e024c483d5ef8e0f1a9c404b17fb0e1fd Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 18 Jul 2023 14:00:32 +0300 Subject: [PATCH 29/35] Add --will-cite where necessary --- 06.align | 1 + generic.slurm | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/06.align b/06.align index 592621b..e486e3d 100755 --- a/06.align +++ b/06.align @@ -27,6 +27,7 @@ ${DOCALIGN} -j ${DOCALIGN_THREADS:-$THREADS} --threshold 0.1 \ -l ${SRC_BATCH}/sentences_${TARGET_LANG%~*}.gz\ | /usr/bin/time -f '{"task":"bleualign", "pair":'"$PAIR_FORMAT"', "time":'"$TIME_FORMAT"'}' \ parallel \ + --will-cite \ --tmpdir=$TMPDIR \ -j${BLEUALIGN_THREADS:-$THREADS} \ --halt 2 \ 
diff --git a/generic.slurm b/generic.slurm index 3c591b6..df1bd33 100755 --- a/generic.slurm +++ b/generic.slurm @@ -60,10 +60,11 @@ fi GROUP_END=$(( $TASKS_PER_BATCH * $SLURM_ARRAY_TASK_ID )) GROUP_START=$(( $GROUP_END - $TASKS_PER_BATCH )) -echo "Processing $GROUP_START to $GROUP_END with $THREADS threads" +echo "Processing $GROUP_START to $GROUP_END in ${SLURM_TASKS_PER_NODE} parallel jobs each with $THREADS threads" awk "NR > $GROUP_START && NR <= $GROUP_END" $BATCHES \ | parallel \ + --will-cite \ -j${SLURM_TASKS_PER_NODE} \ --line-buffer \ --colsep $'\t' \ From 7821e8d4fdcc1ba9469057be4d2efccb87653297 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 18 Jul 2023 14:00:58 +0300 Subject: [PATCH 30/35] Fix bergamot model configuration path --- models/translate-bergamot.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/translate-bergamot.sh b/models/translate-bergamot.sh index 54b024d..16082e7 100755 --- a/models/translate-bergamot.sh +++ b/models/translate-bergamot.sh @@ -6,6 +6,6 @@ MODEL=$(dirname $(realpath -es ${BASH_SOURCE[0]}))/model foldfilter -s -w 500 \ $MARIAN/marian-decoder \ - -c $MODEL/config.yml \ + -c $MODEL/config.intgemm8bit.alphas.yml \ --cpu-threads $THREADS \ --quiet-translation From d822f608c4ae5e1b0ead8c8c445d0c10e6b50c62 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 18 Jul 2023 14:01:30 +0300 Subject: [PATCH 31/35] Add LUMI base module --- env/init.d/lumi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/env/init.d/lumi.sh b/env/init.d/lumi.sh index 587cecf..d176bda 100644 --- a/env/init.d/lumi.sh +++ b/env/init.d/lumi.sh @@ -1,6 +1,6 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then module purge - module load Boost/1.81.0-cpeCray-23.03 # This must be changed if boost version changes in env/setup.d/paracrawl + module load LUMI/23.03 Boost/1.81.0-cpeCray-23.03 # This must be changed if boost version changes in env/setup.d/paracrawl #module purge && module load \ # 
PrgEnv-cray/8.3.3 \ # craype-x86-milan \ From 5bc79715f86f30bf4beb13bc520afb166482cc08 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 26 Sep 2024 18:18:36 +0300 Subject: [PATCH 32/35] Use singularity container --- .gitignore | 2 ++ Dockerfile | 42 ++++++++++++++++++++++++++++++++++++++++++ cirrus-scripts.def | 2 ++ config.d/10.lumi.sh | 17 ++++++++--------- env/init.d/lumi.sh | 5 +++-- env/src/bifixer | 2 +- env/src/bitextor | 2 +- generic.slurm | 7 ++++++- pipeline.sh | 16 ++++++++-------- 9 files changed, 73 insertions(+), 22 deletions(-) create mode 100644 Dockerfile create mode 100644 cirrus-scripts.def diff --git a/.gitignore b/.gitignore index e0cfafc..ccd8205 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,5 @@ filtered-terms.txt *.swp models/??-?? +*.sif +core diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..1c27bb4 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,42 @@ +FROM bitextor/bitextor:8.3 + +COPY cirrus-scripts /cirrus-scripts +WORKDIR /cirrus-scripts + +RUN git submodule update --init env/src/preprocess/ +RUN mkdir /cirrus-scripts/env/src/paracrawl/build && \ + cd /cirrus-scripts/env/src/paracrawl/build && \ + cmake .. && \ + make -j8 merge_sort && \ + cp bin/merge_sort /usr/local/bin/ + +COPY GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB /mkl-key.pub +RUN mkdir /etc/apt/keyrings +RUN gpg --dearmor -o /etc/apt/keyrings/mkl.gpg /mkl-key.pub && rm /mkl-key.pub +RUN echo "deb [signed-by=/etc/apt/keyrings/mkl.gpg] https://apt.repos.intel.com/mkl all main" > /etc/apt/sources.list.d/intel-mkl.list +RUN apt-get update && apt-get install -yy intel-mkl-64bit-2020.0-088 + +# Compile Marian CPU from Bergamot +RUN git clone https://github.com/browsermt/marian-dev /opt/marian-bergamot +WORKDIR /opt/marian-bergamot +RUN git checkout 2be8344fcf2776fb43a7376284067164674cbfaf +WORKDIR /opt/marian-bergamot/build +RUN cmake .. 
-DUSE_SENTENCEPIECE=on -DCOMPILE_CUDA=off -DUSE_FBGEMM=on +RUN make -j24 + +RUN pip uninstall -y tensorflow keras +RUN pip install tensorflow-rocm==2.12.1.600 + +RUN apt-get remove -yy intel-mkl-64bit-2020.0-088 build-essential && apt-get -yy autoremove && \ + rm -Rf /opt/marian-bergamot/build/src && \ + rm -Rf /opt/marian-bergamot/src && \ + rm -Rf /opt/marian-bergamot/build/local && \ + rm -Rf /opt/marian-bergamot/build/libmarian.a && \ + strip /opt/marian-bergamot/build/marian* && \ + strip /opt/marian-bergamot/build/spm* + +RUN apt-get install -y locales +RUN locale-gen en_US.UTF-8 +ENV LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8 + +ENTRYPOINT ["/bin/bash"] diff --git a/cirrus-scripts.def b/cirrus-scripts.def new file mode 100644 index 0000000..761f8c8 --- /dev/null +++ b/cirrus-scripts.def @@ -0,0 +1,2 @@ +bootstrap: docker-daemon +from: cirrus-scripts:latest diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh index 0711eb8..dc38820 100644 --- a/config.d/10.lumi.sh +++ b/config.d/10.lumi.sh @@ -1,6 +1,6 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then - PROJ_DIR=/projappl/project_462000252/zaragoza - SCRATCH_DIR=/scratch/project_462000252/zaragoza + PROJ_DIR=/projappl/project_465000498/zaragoza + SCRATCH_DIR=/scratch/project_465000498/zaragoza/cirrus-scripts-data # Override binaries called by env variable # they should be available in PATH for lumi @@ -22,32 +22,31 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then } function bicleaner_ai_model { - export HUGGINGFACE_HUB_CACHE="/projappl/project_462000252/.cache/huggingface/hub" + export HUGGINGFACE_HUB_CACHE="/projappl/project_465000498/.cache/huggingface/hub" export BIFIXER_PARAMS="--aggressive_dedup -q" export BICLEANER=bicleaner-ai-classify export BICLEANER_THRESHOLD="0.5" export BICLEANER_PARAMS="-q --batch_size 64 --block_size 100000" export BICLEANER_MODEL=bitextor/bicleaner-ai-full-${TARGET_LANG%~*}-${lang%~*} - export 
HUGGINGFACE_HUB_CACHE="/projappl/project_462000252/.cache/huggingface/hub" } export DATA_CLEANING=$SCRATCH_DIR/data/clean - COLLECTION_ROOT="$SCRATCH_DIR/data" + export COLLECTION_ROOT="$SCRATCH_DIR/data" declare -A COLLECTIONS=( - ["output_wide15_filtered_sample3"]="$COLLECTION_ROOT/output_wide15_filtered_sample3" + ["sample3"]="$COLLECTION_ROOT/output_wide15_filtered_sample3" ["output_wide15_filtered_sample12"]="$COLLECTION_ROOT/output_wide15_filtered_sample12" ["output_CommonCrawl40_filtered_sample"]="$COLLECTION_ROOT/output_CommonCrawl40_filtered_sample" ["wide16"]="/scratch/project_465000498/hplt/data/wide00016" ) # Where jobs should be executed. Values used in functions.sh/schedule. - export SBATCH_ACCOUNT=project_462000252 + export SBATCH_ACCOUNT=project_465000498 #TODO should investigate if this variable has to be set depending on the step # small partition is allocatable by resources # standard partition is allocatable by node - export SBATCH_PARTITION=small + export SBATCH_PARTITION=debug export SBATCH_MEM_PER_CPU=1750 # Maximum recommended size for LUMI - export SLURM_LOGS=$PROJ_DIR/logs + export SLURM_LOGS=$SCRATCH_DIR/logs export TASKS_PER_BATCH=${TPB:-1} # How many resources should be allocated per slurm job. 
Defaults diff --git a/env/init.d/lumi.sh b/env/init.d/lumi.sh index d176bda..bd37dc7 100644 --- a/env/init.d/lumi.sh +++ b/env/init.d/lumi.sh @@ -1,6 +1,7 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then module purge - module load LUMI/23.03 Boost/1.81.0-cpeCray-23.03 # This must be changed if boost version changes in env/setup.d/paracrawl + module load LUMI/23.09 + #module load Boost/1.81.0-cpeCray-23.03 # This must be changed if boost version changes in env/setup.d/paracrawl #module purge && module load \ # PrgEnv-cray/8.3.3 \ # craype-x86-milan \ @@ -17,7 +18,7 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then export IS_LUMI=true #export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/test_env/conda_env/bin:$PATH" - export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/bitextor-8.1/bin:$PATH" + #export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/bitextor-8.1/bin:$PATH" export SQUEUE_FORMAT="%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R" fi diff --git a/env/src/bifixer b/env/src/bifixer index 1ca2f1b..7367178 160000 --- a/env/src/bifixer +++ b/env/src/bifixer @@ -1 +1 @@ -Subproject commit 1ca2f1bd7f844a13110471a6d03b5264627a1626 +Subproject commit 7367178785a75d1d92d23b43b6a70b1fe5dc3a58 diff --git a/env/src/bitextor b/env/src/bitextor index eaa85a6..845f81e 160000 --- a/env/src/bitextor +++ b/env/src/bitextor @@ -1 +1 @@ -Subproject commit eaa85a64b344b1673d8a19440fcea41c56f638dc +Subproject commit 845f81e94b86d6864d87144042ea1c5bab920dd6 diff --git a/generic.slurm b/generic.slurm index df1bd33..a2b7632 100755 --- a/generic.slurm +++ b/generic.slurm @@ -63,7 +63,12 @@ GROUP_START=$(( $GROUP_END - $TASKS_PER_BATCH )) echo "Processing $GROUP_START to $GROUP_END in ${SLURM_TASKS_PER_NODE} parallel jobs each with $THREADS threads" awk "NR > $GROUP_START && NR <= $GROUP_END" $BATCHES \ -| parallel \ +| singularity exec \ + -B $(pwd -P) \ + -B $COLLECTION_ROOT \ + --pwd $(pwd -P) \ + cirrus-scripts.sif \ +parallel \ --will-cite \ 
-j${SLURM_TASKS_PER_NODE} \ --line-buffer \ diff --git a/pipeline.sh b/pipeline.sh index b83b1d3..e022fc5 100755 --- a/pipeline.sh +++ b/pipeline.sh @@ -78,14 +78,14 @@ main() { local collections=($(./collections.sh -gwb)) - case $lang in - ca|eu|gl|oc) - export TARGET_LANG=es - ;; - en|ga|hr|is|nn|no|nb) - collections=(${collections[@]} gwb) - ;; - esac + #case $lang in + # ca|eu|gl|oc) + # export TARGET_LANG=es + # ;; + # en|ga|hr|is|nn|no|nb) + # collections=(${collections[@]} gwb) + # ;; + #esac for collection in ${collections[@]}; do job_id=$(schedule_batch_jobs $collection $lang) From 0511f8a80161422978544d30999117636e107b2c Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 26 Sep 2024 18:19:38 +0300 Subject: [PATCH 33/35] Use loomchild segmenter and text.gz --- 03.split-text | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/03.split-text b/03.split-text index 44f17b8..bce98c6 100755 --- a/03.split-text +++ b/03.split-text @@ -5,18 +5,16 @@ ulimit -n 16384 SLANG="$1" BATCH="$2" -SPLIT="perl $KPU/moses/ems/support/split-sentences.perl" - echo "Processing (${SLANG}) ${BATCH}" -< ${BATCH}/plain_text.gz gzip -dc \ -| $SPLIT -k -q -n -d -l $SLANG -c 524288 \ +< ${BATCH}/text.gz gzip -dc \ +| py-segment -l $SLANG \ | gzip -9c \ > ${BATCH}/sentences.$$.gz echo "Testing output" -docs_pt=$(gzip -cd ${BATCH}/plain_text.gz | wc -l) +docs_pt=$(gzip -cd ${BATCH}/text.gz | wc -l) docs_st=$(gzip -cd ${BATCH}/sentences.$$.gz | wc -l) echo "Expecting $docs_pt documents, found $docs_st" test $docs_pt -eq $docs_st || exit 1 From d4e0acfd7726fd13b5c20e33eb74cb2c41189328 Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 26 Sep 2024 18:20:35 +0300 Subject: [PATCH 34/35] Adapt translate-bergamot to marian container path and use config.yml Always enable max-length-crop in case user forgets to add it. Will bring more stability. 
--- models/translate-bergamot.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/models/translate-bergamot.sh b/models/translate-bergamot.sh index 16082e7..aec8778 100755 --- a/models/translate-bergamot.sh +++ b/models/translate-bergamot.sh @@ -1,11 +1,12 @@ #!/bin/bash set -euo pipefail -MARIAN=/projappl/project_462000252/software/marian-bergamot +MARIAN=/opt/marian-bergamot/build MODEL=$(dirname $(realpath -es ${BASH_SOURCE[0]}))/model foldfilter -s -w 500 \ $MARIAN/marian-decoder \ - -c $MODEL/config.intgemm8bit.alphas.yml \ + -c $MODEL/config.yml \ --cpu-threads $THREADS \ - --quiet-translation + --quiet-translation \ + --max-length-crop From 615ef60e836fb6d4810668fa7450f9beefeae8ad Mon Sep 17 00:00:00 2001 From: ZJaume Date: Thu, 26 Sep 2024 18:32:02 +0300 Subject: [PATCH 35/35] Use tokeniser.perl inside the container --- config.d/10.lumi.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh index dc38820..84c34f7 100644 --- a/config.d/10.lumi.sh +++ b/config.d/10.lumi.sh @@ -7,6 +7,7 @@ if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then export DOCALIGN=docalign export DOCJOIN=docjoin export BLEUALIGN=bleualign_cpp + export TOKENISER="/home/docker/bitextor/third_party/preprocess/moses/tokenizer/tokenizer.perl" function bicleaner_model { local lang=$1