diff --git a/.gitignore b/.gitignore
index 933109b..ccd8205 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,7 @@
 /env/src/recode-*
 /env/src/hunspell-*
 filtered-terms.txt
+*.swp
+models/??-??
+*.sif
+core
diff --git a/03.split-text b/03.split-text
index b0f3717..bce98c6 100755
--- a/03.split-text
+++ b/03.split-text
@@ -5,26 +5,20 @@
 ulimit -n 16384
 
 SLANG="$1"
 BATCH="$2"
 
-SPLIT="perl $KPU/moses/ems/support/split-sentences.perl"
-
 echo "Processing (${SLANG}) ${BATCH}"
 
-< ${BATCH}/plain_text.gz gzip -dc \
-| $SPLIT -k -q -n -d -l $SLANG -c 524288 \
+< ${BATCH}/text.gz gzip -dc \
+| py-segment -l $SLANG \
 | gzip -9c \
-> ${TMPDIR}/sentences.$$.gz
+> ${BATCH}/sentences.$$.gz
 echo "Testing output"
-docs_pt=$(gzip -cd ${BATCH}/plain_text.gz | wc -l)
-docs_st=$(gzip -cd ${TMPDIR}/sentences.$$.gz | wc -l)
+docs_pt=$(gzip -cd ${BATCH}/text.gz | wc -l)
+docs_st=$(gzip -cd ${BATCH}/sentences.$$.gz | wc -l)
 echo "Expecting $docs_pt documents, found $docs_st"
 test $docs_pt -eq $docs_st || exit 1
 
-# Move in two steps. First copies it to the shared fs which
-# might fail because it hits a quota. Second marks it as
-# the real thing.
-mv ${TMPDIR}/sentences.$$.gz ${BATCH}/sentences.$$.gz
 mv ${BATCH}/sentences.$$.gz ${BATCH}/sentences.gz
 
 echo "Copied result (${SLANG}) ${BATCH}"
diff --git a/05.tokenise b/05.tokenise
index 0c20a33..bf459bc 100755
--- a/05.tokenise
+++ b/05.tokenise
@@ -23,25 +23,22 @@ export -f tokenise
 
 echo "Processing (${SLANG}) ${BATCH}"
 
 < ${BATCH}/${INPUT}.gz gzip -dc \
-| b64filter cache bash -c tokenise \
+| b64filter bash -c tokenise \
 | gzip -9c \
-> ${TMPDIR}/${OUTPUT}.$TMPSFX.gz
+> ${BATCH}/${OUTPUT}.$TMPSFX.gz
 
 echo "Checking output"
 docs_st=$(gzip -cd ${BATCH}/${INPUT}.gz | wc -l)
-docs_tk=$(gzip -cd ${TMPDIR}/${OUTPUT}.$TMPSFX.gz | wc -l)
+docs_tk=$(gzip -cd ${BATCH}/${OUTPUT}.$TMPSFX.gz | wc -l)
 echo "Expecting $docs_st documents, found $docs_tk"
 test $docs_st -eq $docs_tk || exit 1
 
 lines_st=$(gzip -cd ${BATCH}/${INPUT}.gz | base64 -d | wc -l)
-lines_tk=$(gzip -cd ${TMPDIR}/${OUTPUT}.$TMPSFX.gz | base64 -d | wc -l)
+lines_tk=$(gzip -cd ${BATCH}/${OUTPUT}.$TMPSFX.gz | base64 -d | wc -l)
 echo "Expecting $lines_st lines, found $lines_tk"
 test $lines_st -eq $lines_tk || exit 1
 
-# Two-step move because the first one might fail and leave an
-# incomplete file behind, which is tricky to detect.
-mv ${TMPDIR}/${OUTPUT}.$TMPSFX.gz ${BATCH}/${OUTPUT}.$TMPSFX.gz
 mv ${BATCH}/${OUTPUT}.$TMPSFX.gz ${BATCH}/${OUTPUT}.gz
 
 echo "Moved result (${SLANG}) ${BATCH}/${OUTPUT}.gz"
diff --git a/06.align b/06.align
index a962dc8..e486e3d 100755
--- a/06.align
+++ b/06.align
@@ -18,7 +18,7 @@ TMPSFX=${JOB_ID:-$$}
 ${DOCALIGN} -j ${DOCALIGN_THREADS:-$THREADS} --threshold 0.1 \
 	${SRC_BATCH}/tokenised_${TARGET_LANG%~*}.gz \
 	${REF_BATCH}/tokenised_${TARGET_LANG%~*}.gz \
-| tee ${SRC_BATCH}/pairs-${TARGET_LANG%~*}-${REF_BATCH_ID}.txt \
+| cut -f2- \
 | ${DOCJOIN} \
 	-li\
 	-ri\
@@ -27,6 +27,7 @@ ${DOCALIGN} -j ${DOCALIGN_THREADS:-$THREADS} --threshold 0.1 \
 	-l ${SRC_BATCH}/sentences_${TARGET_LANG%~*}.gz\
 | /usr/bin/time -f '{"task":"bleualign", "pair":'"$PAIR_FORMAT"', "time":'"$TIME_FORMAT"'}' \
 parallel \
+	--will-cite \
 	--tmpdir=$TMPDIR \
 	-j${BLEUALIGN_THREADS:-$THREADS} \
 	--halt 2 \
@@ -36,5 +37,6 @@ parallel \
 	${BLEUALIGN} --print-sent-hash --bleu-threshold 0.2 \
 | gzip -c \
 > ${SRC_BATCH}/aligned-${REF_BATCH_ID}.gz.$TMPSFX
+
 mv ${SRC_BATCH}/aligned-${REF_BATCH_ID}.gz{.$TMPSFX,}
diff --git a/06.align.sh b/06.align.sh
index 504d7ca..841b01b 100755
--- a/06.align.sh
+++ b/06.align.sh
@@ -45,13 +45,6 @@ declare -a OPTIONS=(
 	-o ${SLURM_LOGS}/06.align-%A_%a.out
 )
 
-# Quick hack, should be a --option option, but functions.sh doesn't
-# allow for that at the moment. Someday...
-if [[ ! -z ${OOM_PROOF:-} ]]; then
-	OPTIONS+=(--mem-per-cpu 12G)
-	export BLEUALIGN_THREADS=4
-fi
-
 collection=$1
 shift
diff --git a/07.fix b/07.fix
index f5215a5..d405f9b 100755
--- a/07.fix
+++ b/07.fix
@@ -26,11 +26,14 @@ remove_empty_lines() {
 	awk -F"\t" '$3 != "" && $4 != "" { print }'
 }
 
+# Fix bicleaner model path for non-huggingface tools (aka bicleaner-hardrules)
+BICLEANER_MODEL_GIT_DIR=$HUGGINGFACE_HUB_CACHE/models--${BICLEANER_MODEL//\//--}
+BICLEANER_MODEL=${BICLEANER_MODEL_GIT_DIR}/snapshots/$(cat $BICLEANER_MODEL_GIT_DIR/refs/main)
+
 for match in $batch/aligned-+([0-9]).gz; do
 	echo $match 1>&2
 	matched_batch=$(echo $match | sed 's/.*-\([0-9]*\)\.gz/\1/')
 	paste <(gzip -cd ${match} \
-	        | awk -F '\t' '{ print 0.0 "\t" $1 "\t" $2}' `# bitextor's docjoin expects a score column, which it then ignores` \
 	        | docjoin \
 	          -r ${target_lang_data}/${shard}/${matched_batch}/url.gz \
 	          -l $(dirname ${match})/url.gz) `# 1,2: target & source url`\
@@ -53,7 +56,7 @@ done \
 	--target_lang $bicleaner_lang \
 	--scol 3 \
 	--tcol 4 \
-	--metadata $BICLEANER_MODEL \
+	--metadata $BICLEANER_MODEL/metadata.yaml \
 	/dev/stdin /dev/stdout \
 | pigz -9c \
 >$HARDRULED.$TMPSFX
diff --git a/08.score b/08.score
index bdc3bea..b29c306 100755
--- a/08.score
+++ b/08.score
@@ -2,6 +2,11 @@
 set -euo pipefail
 shopt -s extglob
 
+if [ "$IS_LUMI" = true ]; then
+	module load CrayEnv
+	module load rocm/5.2.3
+fi
+
 collection=$1
 lang=$2
 target_lang_data=$3
@@ -22,7 +27,6 @@ test -r $HARDRULED
 paste <(zcat $FIXED) <(zcat $HARDRULED) \
 | cache -k 3,4 ./score-wrap.py $BICLEANER $BICLEANER_PARAMS \
 	--score_only \
-	--processes $THREADS \
 	--tmp_dir $TMPDIR \
 	--disable_hardrules \
 	--disable_porn_removal \
diff --git a/08.score.sh b/08.score.sh
index fa86b96..2c6138a 100755
--- a/08.score.sh
+++ b/08.score.sh
@@ -10,10 +10,17 @@ set -euo pipefail
 collection=$1
 shift
 
-export SBATCH_ACCOUNT=t2-cs119-gpu
-export SBATCH_PARTITION=pascal
-export SLURM_TASKS_PER_NODE=1 # No parallelism in generic.slurm plz, they'll have to share the gpu otherwise.
-export SBATCH_GRES=gpu:1
+if [ "$IS_LUMI" = true ]; then
+	export SBATCH_PARTITION="small-g"
+	export SLURM_TASKS_PER_NODE=1 # No parallelism in generic.slurm plz, they'll have to share the gpu otherwise.
+	export SBATCH_GPUS_PER_TASK=1
+	unset SBATCH_MEM_PER_CPU # If we are setting this for the small partition, we don't need it for gpu jobs
+else
+	export SBATCH_ACCOUNT=t2-cs119-gpu
+	export SBATCH_PARTITION=pascal
+	export SLURM_TASKS_PER_NODE=1 # No parallelism in generic.slurm plz, they'll have to share the gpu otherwise.
+	export SBATCH_GRES=gpu:1
+fi
 
 for lang in $*; do
 	bicleaner_ai_model $lang
diff --git a/09.clean b/09.clean
index b9ec692..0f18d98 100755
--- a/09.clean
+++ b/09.clean
@@ -42,7 +42,7 @@ paste <(pigz -dc $FIXED) <(pigz -dc $SCORED) `# add bicleaner score as the 9th c
 	>(pigz -9c > $CLASSIFIED.$TMPSFX) \
 	>(wc -wl | sed 's/^ \+//' | tr -s ' ' '\t' > $STATS.$TMPSFX) \
 | awk -F"\t" "\$9 >= ${BICLEANER_THRESHOLD}" \
-| python3 $BITEXTOR/bitextor-elrc-filtering.py -c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection" -s \
+| python3 bitextor-elrc-filtering.py -c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection" -s \
 | LC_ALL=C sort -t$'\t' -k7,7 -k8,8nr \
 | pigz -9c \
 > $FILTERED.$TMPSFX \
diff --git a/10.reduce-classified.sh b/10.reduce-classified.sh
index e588fcd..f81f613 100755
--- a/10.reduce-classified.sh
+++ b/10.reduce-classified.sh
@@ -20,6 +20,7 @@ for collection in $collections; do
 	batch_lists+=( $batch_list )
 done
 
+mkdir -p $DATA_CLEANING
 output_file="${DATA_CLEANING}/${TARGET_LANG}-${lang}/${TARGET_LANG%~*}-${lang%~*}.${collection_hash}.classified.gz"
 
 if [ ! -f $output_file ] || ! $RETRY; then
diff --git a/11.reduce-filtered b/11.reduce-filtered
index 8caa69d..9086e3b 100755
--- a/11.reduce-filtered
+++ b/11.reduce-filtered
@@ -7,8 +7,8 @@ shift
 
 # Set up temp directory
 TMPSFX=${JOB_ID:-$$}
-#TMPDIR=$(mktemp -d --tmpdir=$(dirname $output_file) --suffix=_$TMPSFX)
-TMPDIR=$(mktemp -d --tmpdir=$SCRATCH --suffix=_$TMPSFX)
+TMPDIR=$(mktemp -d --tmpdir=$(dirname $output_file) --suffix=_$TMPSFX)
+#TMPDIR=$(mktemp -d --tmpdir=$SCRATCH --suffix=_$TMPSFX)
 test -d "$TMPDIR"
 trap "rm -rf $TMPDIR" EXIT
diff --git a/12.reduce-tmx b/12.reduce-tmx
index 659617f..cc07dcc 100755
--- a/12.reduce-tmx
+++ b/12.reduce-tmx
@@ -9,7 +9,7 @@ shift 3
 filtered_input=$@
 
 pigz -cd $filtered_input \
-| PYTHONPATH=$PREFIX/src/bitextor python3 ${SCRIPTS}/bitextor-buildTMX.py \
+| python3 bitextor-buildTMX.py \
 	--lang1 ${TARGET_LANG%~*} --lang2 ${lang} \
 	-c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection,lengthratio,numTokensSL,numTokensTL" \
 	--no-delete-seg \
diff --git a/12.reduce-tmx-deferred b/12.reduce-tmx-deferred
index 284600f..d8baca5 100755
--- a/12.reduce-tmx-deferred
+++ b/12.reduce-tmx-deferred
@@ -8,7 +8,7 @@ shift 2
 filtered_input=$@
 
 pigz -cd $filtered_input \
-| PYTHONPATH=$PREFIX/src/bitextor python3 ${SCRIPTS}/bitextor-buildTMX.py \
+| python3 bitextor-buildTMX.py \
 	--lang1 ${TARGET_LANG%~*} --lang2 ${lang} \
 	-c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection,lengthratio,numTokensSL,numTokensTL" \
 	--dedup "bifixerhash" \
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1c27bb4
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,42 @@
+FROM bitextor/bitextor:8.3
+
+COPY cirrus-scripts /cirrus-scripts
+WORKDIR /cirrus-scripts
+
+RUN git submodule update --init env/src/preprocess/
+RUN mkdir /cirrus-scripts/env/src/paracrawl/build && \
+    cd /cirrus-scripts/env/src/paracrawl/build && \
+    cmake .. && \
+    make -j8 merge_sort && \
+    cp bin/merge_sort /usr/local/bin/
+
+COPY GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB /mkl-key.pub
+RUN mkdir /etc/apt/keyrings
+RUN gpg --dearmor -o /etc/apt/keyrings/mkl.gpg /mkl-key.pub && rm /mkl-key.pub
+RUN echo "deb [signed-by=/etc/apt/keyrings/mkl.gpg] https://apt.repos.intel.com/mkl all main" > /etc/apt/sources.list.d/intel-mkl.list
+RUN apt-get update && apt-get install -yy intel-mkl-64bit-2020.0-088
+
+# Compile Marian CPU from Bergamot
+RUN git clone https://github.com/browsermt/marian-dev /opt/marian-bergamot
+WORKDIR /opt/marian-bergamot
+RUN git checkout 2be8344fcf2776fb43a7376284067164674cbfaf
+WORKDIR /opt/marian-bergamot/build
+RUN cmake .. -DUSE_SENTENCEPIECE=on -DCOMPILE_CUDA=off -DUSE_FBGEMM=on
+RUN make -j24
+
+RUN pip uninstall -y tensorflow keras
+RUN pip install tensorflow-rocm==2.12.1.600
+
+RUN apt-get remove -yy intel-mkl-64bit-2020.0-088 build-essential && apt-get -yy autoremove && \
+    rm -Rf /opt/marian-bergamot/build/src && \
+    rm -Rf /opt/marian-bergamot/src && \
+    rm -Rf /opt/marian-bergamot/build/local && \
+    rm -Rf /opt/marian-bergamot/build/libmarian.a && \
+    strip /opt/marian-bergamot/build/marian* && \
+    strip /opt/marian-bergamot/build/spm*
+
+RUN apt-get install -y locales
+RUN locale-gen en_US.UTF-8
+ENV LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8
+
+ENTRYPOINT ["/bin/bash"]
diff --git a/bitextor-buildTMX.py b/bitextor-buildTMX.py
index d00d080..b85f408 100644
--- a/bitextor-buildTMX.py
+++ b/bitextor-buildTMX.py
@@ -39,8 +39,7 @@ import unicodedata
 
 from xml.sax.saxutils import escape
 
-sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/utils")
-from utils.common import open_xz_or_gzip_or_plain, dummy_open
+from bitextor.utils.common import open_xz_or_gzip_or_plain, dummy_open
 
 def remove_control_characters(text):
     return "".join(ch for ch in text if unicodedata.category(ch)[0]!="C")
diff --git a/bitextor-elrc-filtering.py b/bitextor-elrc-filtering.py
new file mode 100644
index 0000000..38de231
--- /dev/null
+++ b/bitextor-elrc-filtering.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+# This file is part of Bitextor.
+#
+# Bitextor is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Bitextor is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Bitextor. If not, see <https://www.gnu.org/licenses/>.
+
+import sys
+import argparse
+
+oparser = argparse.ArgumentParser(
+    description="Script that takes a list of aligned segments, such as that produced by the bitextor-alignsegments "
+                "script, and computes the basic ELRC quality metrics: number of tokens in lang1/lang2 and length "
+                "ratio.")
+oparser.add_argument('aligned_seg', metavar='FILE', nargs='?',
+                     help='File containing the set of aligned segments (if undefined, the script reads from the '
+                          'standard input)',
+                     default=None)
+oparser.add_argument("-s", "--stats", help="Print stats or just output the input", action="store_true",
+                     dest="isPrintingStats", default=False)
+oparser.add_argument("-f", "--filtering", help="Filter lines according to ELRC rules (printing stats required)",
+                     action="store_true", dest="isFiltering", default=False)
+oparser.add_argument("-c", "--columns",
+                     help="Name of columns of the input tab separated file split by comma. Default: url1,url2,seg1,"
+                          "seg2,hunalign,bicleaner",
+                     default="url1,url2,seg1,seg2,hunalign,bicleaner")
+
+options = oparser.parse_args()
+
+if options.aligned_seg is not None:
+    reader = open(options.aligned_seg, "r")
+else:
+    reader = sys.stdin
+
+columns = options.columns.split(',')
+
+for i in reader:
+    fields = i.split("\t")
+    fields[-1] = fields[-1].strip()
+    fieldsdict = dict()
+    extracolumns = []
+
+    for field, column in zip(fields, columns):
+        fieldsdict[column] = field
+    if options.isPrintingStats:
+        extracolumns = ["lengthratio", "numTokensSL", "numTokensTL"]
+        if len(fieldsdict["seg2"]) == 0:
+            lengthRatio = 0
+        else:
+            lengthRatio = len(fieldsdict["seg1"]) * 1.0 / len(fieldsdict["seg2"])
+        numTokensSL = len(fieldsdict["seg1"].split(
+            ' '))  # This is not the way this should be counted, we need to tokenize better first
+        numTokensTL = len(fieldsdict["seg2"].split(
+            ' '))  # This is not the way this should be counted, we need to tokenize better first
+        fieldsdict["lengthratio"] = str(lengthRatio)
+        fieldsdict["numTokensSL"] = str(numTokensSL)
+        fieldsdict["numTokensTL"] = str(numTokensTL)
+        if options.isFiltering:
+            if "bicleaner" in fieldsdict and fieldsdict["bicleaner"].strip() != '':
+                fieldsdict["bicleaner"] = str(round(float(fieldsdict["bicleaner"]), 4))
+            if int(fieldsdict["numTokensSL"]) >= 200 or int(fieldsdict["numTokensTL"]) >= 200 or fieldsdict[
+                    "seg1"].strip() == '' or fieldsdict["seg2"].strip() == '' or float(
+                    fieldsdict["lengthratio"]) >= 6 or float(fieldsdict["lengthratio"]) <= 0.1666:
+                continue
+    fieldstoprint = []
+    for column in columns + extracolumns:
+        fieldstoprint.append(fieldsdict[column])
+    print("\t".join(fieldstoprint))
diff --git a/cirrus-scripts.def b/cirrus-scripts.def
new file mode 100644
index 0000000..761f8c8
--- /dev/null
+++ b/cirrus-scripts.def
@@ -0,0 +1,2 @@
+bootstrap: docker-daemon
+from: cirrus-scripts:latest
diff --git a/config.d/10.lumi.sh b/config.d/10.lumi.sh
new file mode 100644
index 0000000..84c34f7
--- /dev/null
+++ b/config.d/10.lumi.sh
@@ -0,0 +1,58 @@
+if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then
+	PROJ_DIR=/projappl/project_465000498/zaragoza
+	SCRATCH_DIR=/scratch/project_465000498/zaragoza/cirrus-scripts-data
+
+	# Override the binaries called via env variables;
+	# they should be available in PATH on LUMI.
+	export DOCALIGN=docalign
+	export DOCJOIN=docjoin
+	export BLEUALIGN=bleualign_cpp
+	export TOKENISER="/home/docker/bitextor/third_party/preprocess/moses/tokenizer/tokenizer.perl"
+
+	function bicleaner_model {
+		local lang=$1
+
+		export BIFIXER_PARAMS="--aggressive_dedup -q"
+		export BICLEANER=bicleaner-classify-lite
+		export BICLEANER_THRESHOLD="0.5"
+		export BICLEANER_PARAMS="-q" # --score_only is always supplied
+
+		# Default path: here instead of in config.csd3 because the path depends on $lang, and the exceptions
+		# above don't follow this pattern very well, which is why it's not in the 09.clean code itself.
+		export BICLEANER_MODEL=$PROJ_DIR/bicleaner-models/${TARGET_LANG%~*}-${lang%~*}/${TARGET_LANG%~*}-${lang%~*}.yaml
+	}
+
+	function bicleaner_ai_model {
+		export HUGGINGFACE_HUB_CACHE="/projappl/project_465000498/.cache/huggingface/hub"
+		export BIFIXER_PARAMS="--aggressive_dedup -q"
+		export BICLEANER=bicleaner-ai-classify
+		export BICLEANER_THRESHOLD="0.5"
+		export BICLEANER_PARAMS="-q --batch_size 64 --block_size 100000"
+		export BICLEANER_MODEL=bitextor/bicleaner-ai-full-${TARGET_LANG%~*}-${lang%~*}
+	}
+
+	export DATA_CLEANING=$SCRATCH_DIR/data/clean
+	export COLLECTION_ROOT="$SCRATCH_DIR/data"
+	declare -A COLLECTIONS=(
+		["sample3"]="$COLLECTION_ROOT/output_wide15_filtered_sample3"
+		["output_wide15_filtered_sample12"]="$COLLECTION_ROOT/output_wide15_filtered_sample12"
+		["output_CommonCrawl40_filtered_sample"]="$COLLECTION_ROOT/output_CommonCrawl40_filtered_sample"
+		["wide16"]="/scratch/project_465000498/hplt/data/wide00016"
+	)
+
+	# Where jobs should be executed. Values used in functions.sh/schedule.
+	export SBATCH_ACCOUNT=project_465000498
+	# TODO: should investigate whether this variable has to be set depending on the step.
+	# The small partition is allocatable by resources;
+	# the standard partition is allocatable by node.
+	export SBATCH_PARTITION=debug
+	export SBATCH_MEM_PER_CPU=1750 # Maximum recommended size for LUMI
+	export SLURM_LOGS=$SCRATCH_DIR/logs
+	export TASKS_PER_BATCH=${TPB:-1}
+
+	# How many resources should be allocated per slurm job. Defaults
+	# to as many as necessary to process all tasks in parallel. Individual
+	# .slurm job definitions define how many cpus should be allocated per
+	# task.
+	export SLURM_TASKS_PER_NODE=${TPN:-1}
+fi
diff --git a/env/init.d/lumi.sh b/env/init.d/lumi.sh
new file mode 100644
index 0000000..bd37dc7
--- /dev/null
+++ b/env/init.d/lumi.sh
@@ -0,0 +1,24 @@
+if [[ $(hostname -A) =~ "uan"[0-9][0-9] ]]; then
+	module purge
+	module load LUMI/23.09
+	#module load Boost/1.81.0-cpeCray-23.03 # This must be changed if the boost version changes in env/setup.d/paracrawl
+	#module purge && module load \
+	#	PrgEnv-cray/8.3.3 \
+	#	craype-x86-milan \
+	#	cray-python/3.9.12.1 \
+	#	perftools-base/22.12.0
+
+	# Recommended options for the Cray compiler per the LUMI docs:
+	# https://docs.lumi-supercomputer.eu/development/compiling/prgenv/#wrapper-and-compiler-options
+	# (The GNU compiler on LUMI throws a warning when loaded, saying it is not recommended.)
+	# According to https://docs.lumi-supercomputer.eu/development/compiling/prgenv/#choosing-the-target-architecture
+	# the craype-x86-milan module has to be loaded for LUMI-C partitions instead of specifying -march.
+	export CFLAGS="-O2 -funroll-loops -ffast-math"
+	export CXXFLAGS="-O2 -funroll-loops -ffast-math"
+	export IS_LUMI=true
+
+	#export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/test_env/conda_env/bin:$PATH"
+	#export PATH="/pfs/lustrep1/projappl/project_462000252/zaragoza/bitextor-8.1/bin:$PATH"
+
+	export SQUEUE_FORMAT="%.18i %.9P %.20j %.8u %.2t %.10M %.6D %R"
+fi
diff --git a/env/setup.d/bifixer b/env/setup.d/bifixer
index 2ce9488..2e5d52f 100644
--- a/env/setup.d/bifixer
+++ b/env/setup.d/bifixer
@@ -9,8 +9,5 @@ depends() {
 }
 
 install() {
-	pip3 install -r bifixer/requirements.txt
-	echo "python3 $PREFIX/src/bifixer/bifixer/bifixer.py \"\$@\"" \
-		> $PREFIX/bin/bifixer
-	chmod +x $PREFIX/bin/bifixer
+	pip install "bifixer==0.8.8"
 }
diff --git a/env/setup.d/giashard b/env/setup.d/giashard
index 5a3fb5d..1003ffd 100644
--- a/env/setup.d/giashard
+++ b/env/setup.d/giashard
@@ -4,8 +4,12 @@ is-installed() {
 	test -x $GOPATH/bin/giashard
 }
 
+depends() {
+	echo go
+}
+
 install() {
-	go get -u github.com/paracrawl/giashard/cmd/giashard
+	go install github.com/paracrawl/giashard/cmd/giashard@latest
 }
diff --git a/env/setup.d/go b/env/setup.d/go
new file mode 100644
index 0000000..bdfced6
--- /dev/null
+++ b/env/setup.d/go
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+is-installed() {
+	test -x $PREFIX/go/bin/go
+}
+
+install() {
+	pushd ..
+	wget -O go.linux-amd64.tgz https://go.dev/dl/go1.20.3.linux-amd64.tar.gz
+	tar xvf go.linux-amd64.tgz
+	rm go.linux-amd64.tgz
+	popd
+}
diff --git a/env/setup.d/kenlm b/env/setup.d/kenlm
index a791d86..c5c356d 100644
--- a/env/setup.d/kenlm
+++ b/env/setup.d/kenlm
@@ -17,6 +17,6 @@ install() {
 	cmake .. -DKENLM_MAX_ORDER=7 -DCMAKE_INSTALL_PREFIX:PATH=$PREFIX
 	make -j8 install
 
-	pip3 install .. --install-option="--max_order 7"
+	pip3 install .. --config-settings="--build-option=--max_order=7"
 	popd
 }
diff --git a/env/setup.d/paracrawl b/env/setup.d/paracrawl
new file mode 100644
index 0000000..1d609fe
--- /dev/null
+++ b/env/setup.d/paracrawl
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+is-installed() {
+	test -x $PREFIX/bin/batch_dedupe
+}
+
+depends() {
+	if [ "$IS_LUMI" == "true" ]; then
+		echo env
+	else
+		echo boost preprocess
+	fi
+}
+
+install() {
+	pushd paracrawl
+
+	mkdir -p build && cd build
+
+	if [ "$IS_LUMI" == "true" ]; then
+		module load Boost/1.81.0-cpeCray-23.03 ICU
+	fi
+
+	cmake .. \
+		-DCMAKE_BUILD_TYPE=Release
+	make -j8
+	cp lib/libparacrawl_util.a $PREFIX/lib/
+	cp bin/* $PREFIX/bin/
+	cd .. && rm -r build
+
+	popd
+}
diff --git a/env/src/bifixer b/env/src/bifixer
index 1ca2f1b..7367178 160000
--- a/env/src/bifixer
+++ b/env/src/bifixer
@@ -1 +1 @@
-Subproject commit 1ca2f1bd7f844a13110471a6d03b5264627a1626
+Subproject commit 7367178785a75d1d92d23b43b6a70b1fe5dc3a58
diff --git a/env/src/bitextor b/env/src/bitextor
index eaa85a6..845f81e 160000
--- a/env/src/bitextor
+++ b/env/src/bitextor
@@ -1 +1 @@
-Subproject commit eaa85a64b344b1673d8a19440fcea41c56f638dc
+Subproject commit 845f81e94b86d6864d87144042ea1c5bab920dd6
diff --git a/generic.slurm b/generic.slurm
index 3c591b6..a2b7632 100755
--- a/generic.slurm
+++ b/generic.slurm
@@ -60,10 +60,16 @@ fi
 GROUP_END=$(( $TASKS_PER_BATCH * $SLURM_ARRAY_TASK_ID ))
 GROUP_START=$(( $GROUP_END - $TASKS_PER_BATCH ))
 
-echo "Processing $GROUP_START to $GROUP_END with $THREADS threads"
+echo "Processing $GROUP_START to $GROUP_END in ${SLURM_TASKS_PER_NODE} parallel jobs each with $THREADS threads"
 
 awk "NR > $GROUP_START && NR <= $GROUP_END" $BATCHES \
-| parallel \
+| singularity exec \
+	-B $(pwd -P) \
+	-B $COLLECTION_ROOT \
+	--pwd $(pwd -P) \
+	cirrus-scripts.sif \
+parallel \
+	--will-cite \
 	-j${SLURM_TASKS_PER_NODE} \
 	--line-buffer \
 	--colsep $'\t' \
diff --git a/models/translate-bergamot.sh b/models/translate-bergamot.sh
new file mode 100755
index 0000000..aec8778
--- /dev/null
+++ b/models/translate-bergamot.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -euo pipefail
+
+MARIAN=/opt/marian-bergamot/build
+MODEL=$(dirname $(realpath -es ${BASH_SOURCE[0]}))/model
+
+foldfilter -s -w 500 \
+$MARIAN/marian-decoder \
+	-c $MODEL/config.yml \
+	--cpu-threads $THREADS \
+	--quiet-translation \
+	--max-length-crop
diff --git a/pipeline.sh b/pipeline.sh
index b83b1d3..e022fc5 100755
--- a/pipeline.sh
+++ b/pipeline.sh
@@ -78,14 +78,14 @@ main() {
 	local collections=($(./collections.sh -gwb))
 
-	case $lang in
-		ca|eu|gl|oc)
-			export TARGET_LANG=es
-			;;
-		en|ga|hr|is|nn|no|nb)
-			collections=(${collections[@]} gwb)
-			;;
-	esac
+	#case $lang in
+	#	ca|eu|gl|oc)
+	#		export TARGET_LANG=es
+	#		;;
+	#	en|ga|hr|is|nn|no|nb)
+	#		collections=(${collections[@]} gwb)
+	#		;;
+	#esac
 
 	for collection in ${collections[@]}; do
 		job_id=$(schedule_batch_jobs $collection $lang)
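
A note on the two-step moves dropped above: 03.split-text and 05.tokenise no longer stage output in $TMPDIR, but every step still writes to a job-suffixed temp file and only then renames it, as 06.align does with brace expansion. A minimal bash sketch of that write-then-rename idiom (the file name and pipeline are hypothetical):

    # Write to a suffixed temp file on the same filesystem first; a crashed
    # job leaves only output.gz.$TMPSFX behind, never a truncated output.gz.
    TMPSFX=${JOB_ID:-$$}
    some_pipeline | gzip -9c > output.gz.$TMPSFX
    # Publish: mv within one filesystem is a rename, so readers see either
    # the old file or the complete new one. The brace expansion rewrites
    # "output.gz.$TMPSFX" to "output.gz".
    mv output.gz{.$TMPSFX,}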
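The snapshot-path rewrite added to 07.fix assumes the standard huggingface_hub cache layout, in which a Hub model id like org/name maps to a models--org--name directory whose refs/main file names the current snapshot. A sketch with a hypothetical model id and cache path:

    # Assumed huggingface_hub cache layout (model id and hash hypothetical):
    #   $HUGGINGFACE_HUB_CACHE/models--bitextor--bicleaner-ai-full-en-is/
    #     refs/main                <- text file holding the snapshot commit hash
    #     snapshots/<commit-hash>/ <- checkout containing metadata.yaml
    BICLEANER_MODEL=bitextor/bicleaner-ai-full-en-is
    dir=$HUGGINGFACE_HUB_CACHE/models--${BICLEANER_MODEL//\//--}  # '/' -> '--'
    echo "$dir/snapshots/$(cat "$dir/refs/main")/metadata.yaml"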
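The generic.slurm change keeps the existing batching arithmetic: each Slurm array task claims a contiguous slice of $BATCHES and hands those lines to parallel inside the container. Traced with hypothetical values:

    # With TASKS_PER_BATCH=8, array task 3 gets lines 17..24 of $BATCHES:
    SLURM_ARRAY_TASK_ID=3
    TASKS_PER_BATCH=8
    GROUP_END=$(( TASKS_PER_BATCH * SLURM_ARRAY_TASK_ID ))  # 24
    GROUP_START=$(( GROUP_END - TASKS_PER_BATCH ))          # 16
    awk "NR > $GROUP_START && NR <= $GROUP_END" "$BATCHES"  # NR is 1-based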