build/eval/gen-card scripts for fsmt (#7155)
* build/eval/gen-card scripts for fsmt * adjust for model renames
This commit is contained in:
62
scripts/fsmt/convert-allenai-wmt16.sh
Executable file
62
scripts/fsmt/convert-allenai-wmt16.sh
Executable file
@@ -0,0 +1,62 @@
|
|||||||
|
#!/usr/bin/env bash

# This script acquires the original checkpoints and converts them to fsmt models.
# It covers:
# - allenai/wmt16-en-de-dist-12-1
# - allenai/wmt16-en-de-dist-6-1
# - allenai/wmt16-en-de-12-1
#
# Requires: gdown, tar, python, transformers-cli, perl

# Abort on the first failing command, so that a broken download or a failed
# conversion never reaches the `transformers-cli upload -y` steps below.
set -euo pipefail

# this script needs to be run from the top level of the transformers repo
if [[ ! -d "src/transformers" ]]; then
  echo "Error: This script needs to be run from the top of the transformers repo" >&2
  exit 1
fi

# -p: do not fail when data/ is left over from a previous run
mkdir -p data

# get data (run once)

cd data
gdown 'https://drive.google.com/uc?id=1x_G2cjvM1nW5hjAB8-vWxRqtQTlmIaQU'
gdown 'https://drive.google.com/uc?id=1oA2aqZlVNj5FarxBlNXEHpBS4lRetTzU'
gdown 'https://drive.google.com/uc?id=1Wup2D318QYBFPW_NKI1mfP_hXOfmUI9r'
tar -xvzf trans_ende_12-1_0.2.tar.gz
tar -xvzf trans_ende-dist_12-1_0.2.tar.gz
tar -xvzf trans_ende-dist_6-1_0.2.tar.gz
gdown 'https://drive.google.com/uc?id=1mNufoynJ9-Zy1kJh2TA_lHm2squji0i9'
gdown 'https://drive.google.com/uc?id=1iO7um-HWoNoRKDtw27YUSgyeubn9uXqj'
tar -xvzf wmt16.en-de.deep-shallow.dist.tar.gz
tar -xvzf wmt16.en-de.deep-shallow.tar.gz
# copy the dictionaries and bpe codes next to each checkpoint
# (the globs are intentionally left unquoted)
cp wmt16.en-de.deep-shallow/data-bin/dict.*.txt trans_ende_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_6-1_0.2
cp wmt16.en-de.deep-shallow/bpecodes trans_ende_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_6-1_0.2
cd -

# run conversions and uploads

PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py \
  --fsmt_checkpoint_path data/trans_ende-dist_12-1_0.2/checkpoint_top5_average.pt \
  --pytorch_dump_folder_path data/wmt16-en-de-dist-12-1

PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py \
  --fsmt_checkpoint_path data/trans_ende-dist_6-1_0.2/checkpoint_top5_average.pt \
  --pytorch_dump_folder_path data/wmt16-en-de-dist-6-1

PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py \
  --fsmt_checkpoint_path data/trans_ende_12-1_0.2/checkpoint_top5_average.pt \
  --pytorch_dump_folder_path data/wmt16-en-de-12-1


# upload
cd data
transformers-cli upload -y wmt16-en-de-dist-12-1
transformers-cli upload -y wmt16-en-de-dist-6-1
transformers-cli upload -y wmt16-en-de-12-1
cd -


# if updating just small files and not the large models, here is a script to generate the right commands
# (it only prints the upload commands, it does not run them):
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24 hours after upload
# So the only way to start using the new model sooner is either:
# 1. download it to a local path and use that path as model_name
# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere
|
||||||
50
scripts/fsmt/convert-allenai-wmt19.sh
Executable file
50
scripts/fsmt/convert-allenai-wmt19.sh
Executable file
@@ -0,0 +1,50 @@
|
|||||||
|
#!/usr/bin/env bash

# This script acquires the original checkpoints and converts them to fsmt models.
# It covers:
# - allenai/wmt19-de-en-6-6-base
# - allenai/wmt19-de-en-6-6-big
#
# Requires: gdown, tar, python, transformers-cli, perl

# Abort on the first failing command, so that a broken download or a failed
# conversion never reaches the `transformers-cli upload -y` steps below.
set -euo pipefail

# this script needs to be run from the top level of the transformers repo
if [[ ! -d "src/transformers" ]]; then
  echo "Error: This script needs to be run from the top of the transformers repo" >&2
  exit 1
fi

# -p: do not fail when data/ is left over from a previous run
mkdir -p data

# get data (run once)

cd data
gdown 'https://drive.google.com/uc?id=1j6z9fYdlUyOYsh7KJoumRlr1yHczxR5T'
gdown 'https://drive.google.com/uc?id=1yT7ZjqfvUYOBXvMjeY8uGRHQFWoSo8Q5'
gdown 'https://drive.google.com/uc?id=15gAzHeRUCs-QV8vHeTReMPEh1j8excNE'
tar -xvzf wmt19.de-en.tar.gz
tar -xvzf wmt19_deen_base_dr0.1_1.tar.gz
tar -xvzf wmt19_deen_big_dr0.1_2.tar.gz
# copy the dictionaries next to each checkpoint
# (the globs are intentionally left unquoted)
cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_base_dr0.1_1
cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_big_dr0.1_2
cd -

# run conversions and uploads

PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py \
  --fsmt_checkpoint_path data/wmt19_deen_base_dr0.1_1/checkpoint_last3_avg.pt \
  --pytorch_dump_folder_path data/wmt19-de-en-6-6-base

PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py \
  --fsmt_checkpoint_path data/wmt19_deen_big_dr0.1_2/checkpoint_last3_avg.pt \
  --pytorch_dump_folder_path data/wmt19-de-en-6-6-big


# upload
cd data
transformers-cli upload -y wmt19-de-en-6-6-base
transformers-cli upload -y wmt19-de-en-6-6-big
cd -


# if updating just small files and not the large models, here is a script to generate the right commands
# (it only prints the upload commands, it does not run them):
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24 hours after upload
# So the only way to start using the new model sooner is either:
# 1. download it to a local path and use that path as model_name
# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere
|
||||||
61
scripts/fsmt/convert-facebook-wmt19.sh
Executable file
61
scripts/fsmt/convert-facebook-wmt19.sh
Executable file
@@ -0,0 +1,61 @@
|
|||||||
|
#!/usr/bin/env bash

# This script acquires the original checkpoints and converts them to fsmt models.
# It covers:
# - facebook/wmt19-ru-en
# - facebook/wmt19-en-ru
# - facebook/wmt19-de-en
# - facebook/wmt19-en-de
#
# Requires: wget, tar, python, transformers-cli, perl

# Abort on the first failing command, so that a broken download or a failed
# conversion never reaches the `transformers-cli upload -y` steps below.
set -euo pipefail

# this script needs to be run from the top level of the transformers repo
if [[ ! -d "src/transformers" ]]; then
  echo "Error: This script needs to be run from the top of the transformers repo" >&2
  exit 1
fi

# -p: do not fail when data/ is left over from a previous run
mkdir -p data

# get data (run once)

cd data
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz
tar -xvzf wmt19.en-de.joined-dict.ensemble.tar.gz
tar -xvzf wmt19.de-en.joined-dict.ensemble.tar.gz
tar -xvzf wmt19.en-ru.ensemble.tar.gz
tar -xvzf wmt19.ru-en.ensemble.tar.gz
cd -

# run conversions and uploads
# (the ru pairs unpack to wmt19.<pair>.ensemble, the de pairs to
#  wmt19.<pair>.joined-dict.ensemble, hence the two path shapes below)

export PAIR=ru-en
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py \
  --fsmt_checkpoint_path "data/wmt19.${PAIR}.ensemble/model4.pt" \
  --pytorch_dump_folder_path "data/wmt19-${PAIR}"

export PAIR=en-ru
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py \
  --fsmt_checkpoint_path "data/wmt19.${PAIR}.ensemble/model4.pt" \
  --pytorch_dump_folder_path "data/wmt19-${PAIR}"

export PAIR=de-en
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py \
  --fsmt_checkpoint_path "data/wmt19.${PAIR}.joined-dict.ensemble/model4.pt" \
  --pytorch_dump_folder_path "data/wmt19-${PAIR}"

export PAIR=en-de
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py \
  --fsmt_checkpoint_path "data/wmt19.${PAIR}.joined-dict.ensemble/model4.pt" \
  --pytorch_dump_folder_path "data/wmt19-${PAIR}"


# upload
cd data
transformers-cli upload -y wmt19-ru-en
transformers-cli upload -y wmt19-en-ru
transformers-cli upload -y wmt19-de-en
transformers-cli upload -y wmt19-en-de
cd -

# if updating just small files and not the large models, here is a script to generate the right commands
# (it only prints the upload commands, it does not run them):
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for map { "wmt19-$_" } ("en-ru", "ru-en", "de-en", "en-de")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

# Caching note: Unfortunately due to CDN caching the uploaded model may be unavailable for up to 24 hours after upload
# So the only way to start using the new model sooner is either:
# 1. download it to a local path and use that path as model_name
# 2. make sure you use: from_pretrained(..., use_cdn=False) everywhere
|
||||||
66
scripts/fsmt/eval-allenai-wmt16.sh
Executable file
66
scripts/fsmt/eval-allenai-wmt16.sh
Executable file
@@ -0,0 +1,66 @@
|
|||||||
|
#!/usr/bin/env bash

# This script evals the following fsmt models:
# - allenai/wmt16-en-de-dist-12-1
# - allenai/wmt16-en-de-dist-6-1
# - allenai/wmt16-en-de-12-1
#
# Requires: sacrebleu, python

# this script needs to be run from the top level of the transformers repo
if [[ ! -d "src/transformers" ]]; then
  echo "Error: This script needs to be run from the top of the transformers repo" >&2
  exit 1
fi

# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)

# NOTE: every run below writes to the same $SAVE_DIR/test_translations.txt and
# $SAVE_DIR/test_bleu.json, so each model overwrites the files of the previous one.

### Normal eval ###

export PAIR=en-de
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=64
export NUM_BEAMS=5
mkdir -p "$DATA_DIR"
# NOTE(review): these are wmt16 models but the test set requested here is wmt19 -
# confirm whether `-t wmt16` was intended.
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"

for MODEL_PATH in \
  allenai/wmt16-en-de-dist-12-1 \
  allenai/wmt16-en-de-dist-6-1 \
  allenai/wmt16-en-de-12-1; do
  echo "$PAIR" "$MODEL_PATH"
  PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py \
    "$MODEL_PATH" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
    --reference_path "$DATA_DIR/val.target" \
    --score_path "$SAVE_DIR/test_bleu.json" \
    --bs "$BS" --task translation --num_beams "$NUM_BEAMS"
done


### Searching hparams eval ###

# The setup is repeated so that this section can be run on its own.
export PAIR=en-de
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=32
export NUM_BEAMS=5
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"

for MODEL_PATH in \
  allenai/wmt16-en-de-dist-12-1 \
  allenai/wmt16-en-de-dist-6-1 \
  allenai/wmt16-en-de-12-1; do
  echo "$PAIR" "$MODEL_PATH"
  # the beam sizes to try are given via --search, not via $NUM_BEAMS
  PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py \
    "$MODEL_PATH" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
    --reference_path "$DATA_DIR/val.target" \
    --score_path "$SAVE_DIR/test_bleu.json" \
    --bs "$BS" --task translation \
    --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
done
|
||||||
54
scripts/fsmt/eval-allenai-wmt19.sh
Executable file
54
scripts/fsmt/eval-allenai-wmt19.sh
Executable file
@@ -0,0 +1,54 @@
|
|||||||
|
#!/usr/bin/env bash

# This script evals the following fsmt models:
# - allenai/wmt19-de-en-6-6-base
# - allenai/wmt19-de-en-6-6-big
#
# Requires: sacrebleu, python

# this script needs to be run from the top level of the transformers repo
if [[ ! -d "src/transformers" ]]; then
  echo "Error: This script needs to be run from the top of the transformers repo" >&2
  exit 1
fi

# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)

# NOTE: every run below writes to the same $SAVE_DIR/test_translations.txt and
# $SAVE_DIR/test_bleu.json, so each model overwrites the files of the previous one.

### Normal eval ###

export PAIR=de-en
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=64
export NUM_BEAMS=5
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"

for MODEL_PATH in \
  allenai/wmt19-de-en-6-6-base \
  allenai/wmt19-de-en-6-6-big; do
  echo "$PAIR" "$MODEL_PATH"
  PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py \
    "$MODEL_PATH" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
    --reference_path "$DATA_DIR/val.target" \
    --score_path "$SAVE_DIR/test_bleu.json" \
    --bs "$BS" --task translation --num_beams "$NUM_BEAMS"
done


### Searching hparams eval ###

# The setup is repeated so that this section can be run on its own.
export PAIR=de-en
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=16
export NUM_BEAMS=5
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"

for MODEL_PATH in \
  allenai/wmt19-de-en-6-6-base \
  allenai/wmt19-de-en-6-6-big; do
  echo "$PAIR" "$MODEL_PATH"
  # the beam sizes to try are given via --search, not via $NUM_BEAMS
  PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py \
    "$MODEL_PATH" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
    --reference_path "$DATA_DIR/val.target" \
    --score_path "$SAVE_DIR/test_bleu.json" \
    --bs "$BS" --task translation \
    --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
done
|
||||||
148
scripts/fsmt/eval-facebook-wmt19.sh
Executable file
148
scripts/fsmt/eval-facebook-wmt19.sh
Executable file
@@ -0,0 +1,148 @@
|
|||||||
|
#!/usr/bin/env bash

# This script evals the following fsmt models:
# - facebook/wmt19-ru-en
# - facebook/wmt19-en-ru
# - facebook/wmt19-de-en
# - facebook/wmt19-en-de
#
# Requires: sacrebleu, python


# this script needs to be run from the top level of the transformers repo
if [[ ! -d "src/transformers" ]]; then
  echo "Error: This script needs to be run from the top of the transformers repo" >&2
  exit 1
fi


# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)

### a short estimate version for quick testing ###

export PAIR=en-ru
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=8
export NUM_BEAMS=8
mkdir -p "$DATA_DIR"
# only the first 10 sentences are kept, to get a fast estimate
sacrebleu -t wmt19 -l "$PAIR" --echo src | head -10 > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref | head -10 > "$DATA_DIR/val.target"
echo "$PAIR"
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py \
  "facebook/wmt19-$PAIR" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
  --reference_path "$DATA_DIR/val.target" \
  --score_path "$SAVE_DIR/test_bleu.json" \
  --bs "$BS" --task translation --num_beams "$NUM_BEAMS"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Normal eval ###

# Each section below sets all the variables it uses, so it can be run on its own.

# ru-en

export PAIR=ru-en
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=8
export NUM_BEAMS=50
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py \
  "facebook/wmt19-$PAIR" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
  --reference_path "$DATA_DIR/val.target" \
  --score_path "$SAVE_DIR/test_bleu.json" \
  --bs "$BS" --task translation --num_beams "$NUM_BEAMS"


# (target BLEU: 41.3 http://matrix.statmt.org/matrix/output/1907?run_id=6937)


# en-ru

export PAIR=en-ru
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=8
export NUM_BEAMS=50
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"
echo "$PAIR"
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py \
  "facebook/wmt19-$PAIR" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
  --reference_path "$DATA_DIR/val.target" \
  --score_path "$SAVE_DIR/test_bleu.json" \
  --bs "$BS" --task translation --num_beams "$NUM_BEAMS"

# (target BLEU: 36.4 http://matrix.statmt.org/matrix/output/1914?score_id=37605)



# en-de

export PAIR=en-de
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=8
# NUM_BEAMS was not set in this section and was silently inherited from the
# en-ru section above (50); it is now set explicitly to the same value.
export NUM_BEAMS=50
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"
echo "$PAIR"
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py \
  "facebook/wmt19-$PAIR" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
  --reference_path "$DATA_DIR/val.target" \
  --score_path "$SAVE_DIR/test_bleu.json" \
  --bs "$BS" --task translation --num_beams "$NUM_BEAMS"

# (target BLEU: 43.1 http://matrix.statmt.org/matrix/output/1909?run_id=6862)


# de-en

export PAIR=de-en
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=8
export NUM_BEAMS=50
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"
echo "$PAIR"
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py \
  "facebook/wmt19-$PAIR" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
  --reference_path "$DATA_DIR/val.target" \
  --score_path "$SAVE_DIR/test_bleu.json" \
  --bs "$BS" --task translation --num_beams "$NUM_BEAMS"

# (target BLEU: 42.3 http://matrix.statmt.org/matrix/output/1902?run_id=6750)
|
||||||
|
|
||||||
|
|
||||||
|
### Searching hparams eval ###

# Each section below sets all the variables it uses, so it can be run on its own.
# The beam sizes to try are given via --search. CUDA_VISIBLE_DEVICES pins each
# run to one GPU (the ru pairs to device 0, the de pairs to device 1).

# ru-en

export PAIR=ru-en
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=32
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"
CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py \
  "facebook/wmt19-$PAIR" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
  --reference_path "$DATA_DIR/val.target" \
  --score_path "$SAVE_DIR/test_bleu.json" \
  --bs "$BS" --task translation \
  --search="num_beams=5 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"


# en-ru

export PAIR=en-ru
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=16
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"
CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py \
  "facebook/wmt19-$PAIR" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
  --reference_path "$DATA_DIR/val.target" \
  --score_path "$SAVE_DIR/test_bleu.json" \
  --bs "$BS" --task translation \
  --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"

# en-de

export PAIR=en-de
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=16
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"
CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py \
  "facebook/wmt19-$PAIR" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
  --reference_path "$DATA_DIR/val.target" \
  --score_path "$SAVE_DIR/test_bleu.json" \
  --bs "$BS" --task translation \
  --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"

# de-en

export PAIR=de-en
export DATA_DIR="data/$PAIR"
export SAVE_DIR="data/$PAIR"
export BS=16
mkdir -p "$DATA_DIR"
sacrebleu -t wmt19 -l "$PAIR" --echo src > "$DATA_DIR/val.source"
sacrebleu -t wmt19 -l "$PAIR" --echo ref > "$DATA_DIR/val.target"
CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py \
  "facebook/wmt19-$PAIR" "$DATA_DIR/val.source" "$SAVE_DIR/test_translations.txt" \
  --reference_path "$DATA_DIR/val.target" \
  --score_path "$SAVE_DIR/test_bleu.json" \
  --bs "$BS" --task translation \
  --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"
|
||||||
134
scripts/fsmt/gen-card-allenai-wmt16.py
Executable file
134
scripts/fsmt/gen-card-allenai-wmt16.py
Executable file
@@ -0,0 +1,134 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Usage:
|
||||||
|
# ./gen-card-allenai-wmt16.py
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
    """Generate the README.md model card for one allenai wmt16 FSMT model.

    Args:
        model_card_dir: directory the card is written into; it is created
            (including parents) if missing. Must provide ``.mkdir()``, e.g. a
            ``pathlib.Path``.
        src_lang: source language code; must be a key of ``texts`` below.
        tgt_lang: target language code; must be a key of ``texts`` below.
        model_name: model name without the ``allenai/`` prefix; must be a key
            of ``scores`` below.

    Side effects:
        Writes ``<model_card_dir>/README.md`` (UTF-8), overwriting any existing
        file, and prints the path being generated.
    """

    # sample sentence per language, used in the "How to use" snippet of the card
    texts = {
        "en": "Machine learning is great, isn't it?",
        "ru": "Машинное обучение - это здорово, не так ли?",
        "de": "Maschinelles Lernen ist großartig, nicht wahr?",
    }

    # BLEU scores as follows:
    # "model name": [fairseq, transformers]
    scores = {
        "wmt16-en-de-dist-12-1": [28.3, 27.52],
        "wmt16-en-de-dist-6-1": [27.4, 27.11],
        "wmt16-en-de-12-1": [26.9, 25.75],
    }
    pair = f"{src_lang}-{tgt_lang}"

    # The card is a single f-string: literal braces (e.g. in the bibtex entry)
    # are escaped as {{ }}, everything in single braces is interpolated.
    readme = f"""
---

language: {src_lang}, {tgt_lang}
thumbnail:
tags:
- translation
- wmt16
- allenai
license: Apache 2.0
datasets:
- http://www.statmt.org/wmt16/ ([test-set](http://matrix.statmt.org/test_sets/newstest2016.tgz?1504722372))

metrics:
- http://www.statmt.org/wmt16/metrics-task.html
---

# FSMT

## Model description

This is a ported version of fairseq-based [wmt16 transformer](https://github.com/jungokasai/deep-shallow/) for {src_lang}-{tgt_lang}.

For more details, please, see [Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation](https://arxiv.org/abs/2006.10369).

All 3 models are available:

* [wmt16-en-de-dist-12-1](https://huggingface.co/allenai/wmt16-en-de-dist-12-1)
* [wmt16-en-de-dist-6-1](https://huggingface.co/allenai/wmt16-en-de-dist-6-1)
* [wmt16-en-de-12-1](https://huggingface.co/allenai/wmt16-en-de-12-1)

```
@misc{{kasai2020deep,
title={{Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation}},
author={{Jungo Kasai and Nikolaos Pappas and Hao Peng and James Cross and Noah A. Smith}},
year={{2020}},
eprint={{2006.10369}},
archivePrefix={{arXiv}},
primaryClass={{cs.CL}}
}}
```

## Intended uses & limitations

#### How to use

```python
from transformers.tokenization_fsmt import FSMTTokenizer
from transformers.modeling_fsmt import FSMTForConditionalGeneration
mname = "allenai/{model_name}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}

```

#### Limitations and bias


## Training data

Pretrained weights were left identical to the original model released by allenai. For more details, please, see the [paper](https://arxiv.org/abs/2006.10369).

## Eval results

Here are the BLEU scores:

model | fairseq | transformers
-------|---------|----------
{model_name} | {scores[model_name][0]} | {scores[model_name][1]}

The score is slightly below the score reported in the paper, as the researchers don't use `sacrebleu` and measure the score on tokenized outputs. `transformers` score was measured using `sacrebleu` on detokenized outputs.

The score was calculated using this code:

```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt16 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt16 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```

"""
    # create the target directory (and any missing parents) before writing
    model_card_dir.mkdir(parents=True, exist_ok=True)
    path = os.path.join(model_card_dir, "README.md")
    print(f"Generating {path}")
    # explicit utf-8: the sample texts contain non-ASCII characters
    with open(path, "w", encoding="utf-8") as f:
        f.write(readme)
|
||||||
|
|
||||||
|
# Locate the project root relative to this file rather than the current
# working directory: this script sits in <repo>/scripts/fsmt/, so the root is
# three levels up from the resolved script path.
repo_dir = Path(__file__).resolve().parents[2]
model_cards_dir = repo_dir.joinpath("model_cards")

# One card per allenai wmt16 model; all of them translate en -> de.
for model_name in ("wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1"):
    model_card_dir = model_cards_dir.joinpath("allenai", model_name)
    write_model_card(
        model_card_dir,
        src_lang="en",
        tgt_lang="de",
        model_name=model_name,
    )
|
||||||
116
scripts/fsmt/gen-card-allenai-wmt19.py
Executable file
116
scripts/fsmt/gen-card-allenai-wmt19.py
Executable file
@@ -0,0 +1,116 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Usage:
|
||||||
|
# ./gen-card-allenai-wmt19.py
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
    """Write the ``README.md`` model card for one allenai wmt19 FSMT model.

    Args:
        model_card_dir: directory (``str`` or ``pathlib.Path``) that receives
            ``README.md``; it is created, parents included, when missing.
        src_lang: source language code; must be a key of ``texts`` below.
        tgt_lang: target language code; must be a key of ``texts`` below.
        model_name: model id without the ``allenai/`` prefix; must be a key
            of ``scores`` below.

    Raises:
        KeyError: if a language code or the model name is missing from the
            lookup tables below.
    """

    # sample sentence per language, used in the "How to use" snippet
    texts = {
        "en": "Machine learning is great, isn't it?",
        "ru": "Машинное обучение - это здорово, не так ли?",
        "de": "Maschinelles Lernen ist großartig, nicht wahr?",
    }

    # BLEU scores as follows:
    # "model_name": [fairseq, transformers]
    # (the fairseq entry is 0 here and is not rendered in the card)
    scores = {
        "wmt19-de-en-6-6-base": [0, 38.37],
        "wmt19-de-en-6-6-big": [0, 39.90],
    }
    pair = f"{src_lang}-{tgt_lang}"

    # the score is rendered with two decimals so that 39.90 is not shortened to 39.9
    readme = f"""
---

language: {src_lang}, {tgt_lang}
thumbnail:
tags:
- translation
- wmt19
- allenai
license: Apache 2.0
datasets:
- http://www.statmt.org/wmt19/ ([test-set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561))
metrics:
- http://www.statmt.org/wmt19/metrics-task.html
---

# FSMT

## Model description

This is a ported version of fairseq-based wmt19 transformer created by [jungokasai](https://github.com/jungokasai/) @ allenai for {src_lang}-{tgt_lang}.

2 models are available:

* [wmt19-de-en-6-6-big](https://huggingface.co/allenai/wmt19-de-en-6-6-big)
* [wmt19-de-en-6-6-base](https://huggingface.co/allenai/wmt19-de-en-6-6-base)

## Intended uses & limitations

#### How to use

```python
from transformers.tokenization_fsmt import FSMTTokenizer
from transformers.modeling_fsmt import FSMTForConditionalGeneration
mname = "allenai/{model_name}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}

```

#### Limitations and bias


## Training data

Pretrained weights were left identical to the original model released by the researcher.

## Eval results

Here are the BLEU scores:

model | transformers
-------|---------
{model_name} | {scores[model_name][1]:.2f}

The score was calculated using this code:

```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```

"""
    # os.makedirs accepts both str and Path, unlike Path.mkdir
    os.makedirs(model_card_dir, exist_ok=True)
    path = os.path.join(model_card_dir, "README.md")
    print(f"Generating {path}")
    with open(path, "w", encoding="utf-8") as f:
        f.write(readme)
|
||||||
|
|
||||||
|
# The repository root is taken to be three directory levels above this
# script's resolved location; every card goes to
# <root>/model_cards/allenai/<model_name>/README.md
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir.joinpath("model_cards")

for model_name in ("wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big"):
    model_card_dir = model_cards_dir.joinpath("allenai", model_name)
    write_model_card(
        model_card_dir,
        src_lang="de",
        tgt_lang="en",
        model_name=model_name,
    )
|
||||||
135
scripts/fsmt/gen-card-facebook-wmt19.py
Executable file
135
scripts/fsmt/gen-card-facebook-wmt19.py
Executable file
@@ -0,0 +1,135 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Usage:
|
||||||
|
# ./gen-card-facebook-wmt19.py
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def write_model_card(model_card_dir, src_lang, tgt_lang):
    """Write the ``README.md`` model card for one facebook wmt19 FSMT model.

    Args:
        model_card_dir: directory (``str`` or ``pathlib.Path``) that receives
            ``README.md``; it is created, parents included, when missing.
        src_lang: source language code; must be a key of ``texts`` below.
        tgt_lang: target language code; must be a key of ``texts`` below.
            ``f"{src_lang}-{tgt_lang}"`` must be a key of ``scores`` below.

    Raises:
        KeyError: if a language code or the language pair is missing from
            the lookup tables below.
    """

    # sample sentence per language, used in the "How to use" snippet
    texts = {
        "en": "Machine learning is great, isn't it?",
        "ru": "Машинное обучение - это здорово, не так ли?",
        "de": "Maschinelles Lernen ist großartig, oder?",
    }

    # BLEU scores as follows:
    # "pair": [fairseq, transformers]
    scores = {
        "ru-en": ["[41.3](http://matrix.statmt.org/matrix/output/1907?run_id=6937)", "39.20"],
        "en-ru": ["[36.4](http://matrix.statmt.org/matrix/output/1914?run_id=6724)", "33.47"],
        "en-de": ["[43.1](http://matrix.statmt.org/matrix/output/1909?run_id=6862)", "42.83"],
        "de-en": ["[42.3](http://matrix.statmt.org/matrix/output/1902?run_id=6750)", "41.35"],
    }
    pair = f"{src_lang}-{tgt_lang}"

    # The "auto-generated" HTML comment is emitted after the closing `---`:
    # inside the front matter it is not valid YAML and breaks metadata parsing.
    readme = f"""
---

language: {src_lang}, {tgt_lang}
thumbnail:
tags:
- translation
- wmt19
license: Apache 2.0
datasets:
- http://www.statmt.org/wmt19/ ([test-set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561))
metrics:
- http://www.statmt.org/wmt19/metrics-task.html
---

<!-- This file has been auto-generated by scripts/fsmt/gen-card-facebook-wmt19.py - DO NOT EDIT or your changes will be lost -->

# FSMT

## Model description

This is a ported version of [fairseq wmt19 transformer](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md) for {src_lang}-{tgt_lang}.

For more details, please see, [Facebook FAIR's WMT19 News Translation Task Submission](https://arxiv.org/abs/1907.06616).

The abbreviation FSMT stands for FairSeqMachineTranslation

All four models are available:

* [wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru)
* [wmt19-ru-en](https://huggingface.co/facebook/wmt19-ru-en)
* [wmt19-en-de](https://huggingface.co/facebook/wmt19-en-de)
* [wmt19-de-en](https://huggingface.co/facebook/wmt19-de-en)

## Intended uses & limitations

#### How to use

```python
from transformers.tokenization_fsmt import FSMTTokenizer
from transformers.modeling_fsmt import FSMTForConditionalGeneration
mname = "facebook/wmt19-{src_lang}-{tgt_lang}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}

```

#### Limitations and bias

- The original (and this ported model) doesn't seem to handle well inputs with repeated sub-phrases, [content gets truncated](https://discuss.huggingface.co/t/issues-with-translating-inputs-containing-repeated-phrases/981)

## Training data

Pretrained weights were left identical to the original model released by fairseq. For more details, please, see the [paper](https://arxiv.org/abs/1907.06616).

## Eval results

pair | fairseq | transformers
-------|---------|----------
{pair} | {scores[pair][0]} | {scores[pair][1]}

The score is slightly below the score reported by `fairseq`, since `transformers` currently doesn't support:
- model ensemble, therefore the best performing checkpoint was ported (``model4.pt``).
- re-ranking

The score was calculated using this code:

```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=15
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```
note: fairseq reports using a beam of 50, so you should get a slightly higher score if re-run with `--num_beams 50`.


## TODO

- port model ensemble (fairseq uses 4 model checkpoints)

"""
    os.makedirs(model_card_dir, exist_ok=True)
    path = os.path.join(model_card_dir, "README.md")
    print(f"Generating {path}")
    with open(path, "w", encoding="utf-8") as f:
        f.write(readme)
|
||||||
|
|
||||||
|
# make sure we are under the root of the project
|
||||||
|
# The repository root is taken to be three directory levels above this
# script's resolved location; every card goes to
# <root>/model_cards/facebook/<model_name>/README.md
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir.joinpath("model_cards")

for model_name in ("wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"):
    # model names have the form "<base>-<src>-<tgt>"; only the languages are used
    base, src_lang, tgt_lang = model_name.split("-")
    model_card_dir = model_cards_dir.joinpath("facebook", model_name)
    write_model_card(
        model_card_dir,
        src_lang=src_lang,
        tgt_lang=tgt_lang,
    )
|
||||||
Reference in New Issue
Block a user