- Download the SAMSum dialogue corpus from https://arxiv.org/src/1911.12237v2/anc/corpus.7z, or execute:
# -f: fail on an HTTP error instead of saving the error page as samsum.7z.
# -L: follow redirects to the final download location.
curl -fL https://arxiv.org/src/1911.12237v2/anc/corpus.7z --output samsum.7z
- Uncompress samsum.7z
- Download the fairseq toolkit:
# Clone the fairseq sources into ./fairseq.
git clone 'https://github.com/pytorch/fairseq.git' fairseq
- Add fairseq to the Python path:
# Append the fairseq checkout (PATH_TO_FAIRSEQ) to PYTHONPATH.
# The ${VAR:+...} form adds the ":" separator only when PYTHONPATH is already
# set, so an empty PYTHONPATH does not produce a leading empty entry.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}PATH_TO_FAIRSEQ"
- Create a directory to save prepared data:
# -p creates any missing parent directories and succeeds if the directory
# already exists, so this step can be re-run safely.
mkdir -p PATH_TO_FAIRSEQ/examples/bart/samsum
- Run prepare_data.py:
# Read the dataset from --path_samsum and write the prepared data to
# --path_samsum_bart.
python prepare_data.py \
  --path_samsum PATH_TO_SAMSUM \
  --path_samsum_bart PATH_SAMSUM_IN_BART
where PATH_TO_SAMSUM is the directory containing the SAMSum dataset and PATH_SAMSUM_IN_BART is the directory created in the previous step.
- Move to the bart directory:
# The commands that follow use paths relative to this directory.
cd "PATH_TO_FAIRSEQ/examples/bart"
- Run BPE tokenization by executing the following commands:
# Fetch the GPT-2 BPE encoder, vocabulary and dictionary.
# -N (timestamping) skips files that are already present and up to date.
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'

# TASK names the data directory; the binarization step reuses it.
TASK=samsum

# BPE-encode each split/side pair:
#   $TASK/<split>.<side>  ->  $TASK/<split>.bpe.<side>
# The inner loop variable is SIDE rather than LANG so that the LANG locale
# environment variable is not overwritten for python or the rest of the session.
for SPLIT in train val; do
  for SIDE in source target; do
    python -m examples.roberta.multiprocessing_bpe_encoder \
      --encoder-json encoder.json \
      --vocab-bpe vocab.bpe \
      --inputs "$TASK/$SPLIT.$SIDE" \
      --outputs "$TASK/$SPLIT.bpe.$SIDE" \
      --workers 1 \
      --keep-empty
  done
done
- Binarize data by executing:
# Binarize the BPE-encoded train/val pairs into $TASK-bin/, using dict.txt as
# the dictionary for both the source and the target side.
python -m preprocess \
  --source-lang source \
  --target-lang target \
  --trainpref "$TASK/train.bpe" \
  --validpref "$TASK/val.bpe" \
  --destdir "$TASK-bin/" \
  --workers 1 \
  --srcdict dict.txt \
  --tgtdict dict.txt
- Download the pretrained BART model:
# Fetch the pretrained BART-large archive and unpack it in the current directory.
wget 'https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz'
tar -xzvf bart.large.tar.gz
- Finetune the model by executing:
# Fine-tuning hyperparameters.
TOTAL_NUM_UPDATES=20000
WARMUP_UPDATES=100
LR=3e-05
MAX_TOKENS=1048
UPDATE_FREQ=4
# Checkpoint to start from: the model unpacked from bart.large.tar.gz in the
# download step (the archive extracts into bart.large/).
BART_PATH=bart.large/model.pt
# Exported so that the python process started below inherits it; a plain
# assignment would only set a shell-local variable.
export CUDA_VISIBLE_DEVICES=0

# Train on the binarized data in samsum-bin; checkpoints are written to the
# same directory (--save-dir). `time` reports how long the run took.
time python -m train samsum-bin \
  --restore-file "$BART_PATH" \
  --max-tokens "$MAX_TOKENS" \
  --task translation \
  --source-lang source --target-lang target \
  --truncate-source \
  --layernorm-embedding \
  --share-all-embeddings \
  --share-decoder-input-output-embed \
  --reset-optimizer --reset-dataloader --reset-meters \
  --required-batch-size-multiple 1 \
  --arch bart_large \
  --criterion label_smoothed_cross_entropy \
  --label-smoothing 0.1 \
  --dropout 0.1 --attention-dropout 0.1 \
  --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \
  --clip-norm 0.1 \
  --lr-scheduler polynomial_decay --lr "$LR" \
  --total-num-update "$TOTAL_NUM_UPDATES" \
  --warmup-updates "$WARMUP_UPDATES" \
  --fp16 --update-freq "$UPDATE_FREQ" \
  --skip-invalid-size-inputs-valid-test \
  --find-unused-parameters \
  --batch-size 1 \
  --save-dir samsum-bin \
  --memory-efficient-fp16 \
  --max-epoch 3 \
  --disable-validation
- Generate summaries from a checkpoint by running generate_summaries.py:
# Generate summaries for the test source file from the given checkpoint and
# write them to the output file.
python generate_summaries.py \
  --checkpoint PATH_TO_CHECKPOINT \
  --test_source PATH_TO_TEST_SOURCE \
  --summaries_file PATH_TO_OUTPUT