From 11bbb505c77a1d29370cf16a964cfe73b7a76340 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Wed, 13 Mar 2024 14:53:27 +0100 Subject: [PATCH] Adds pretrained IDs directly in the tests (#29534) * Adds pretrained IDs directly in the tests * Fix tests * Fix tests * Review! --- tests/models/albert/test_tokenization_albert.py | 1 + tests/models/bart/test_tokenization_bart.py | 1 + tests/models/barthez/test_tokenization_barthez.py | 1 + tests/models/bartpho/test_tokenization_bartpho.py | 1 + tests/models/bert/test_tokenization_bert.py | 1 + .../test_tokenization_bert_generation.py | 1 + .../bert_japanese/test_tokenization_bert_japanese.py | 2 ++ tests/models/bertweet/test_tokenization_bertweet.py | 1 + tests/models/big_bird/test_tokenization_big_bird.py | 1 + tests/models/biogpt/test_tokenization_biogpt.py | 1 + .../test_tokenization_blenderbot_small.py | 1 + tests/models/bloom/test_tokenization_bloom.py | 1 + tests/models/camembert/test_tokenization_camembert.py | 1 + tests/models/canine/test_tokenization_canine.py | 1 + tests/models/clip/test_tokenization_clip.py | 1 + tests/models/clvp/test_tokenization_clvp.py | 1 + .../models/code_llama/test_tokenization_code_llama.py | 1 + tests/models/codegen/test_tokenization_codegen.py | 1 + tests/models/cpmant/test_tokenization_cpmant.py | 1 + tests/models/ctrl/test_tokenization_ctrl.py | 1 + tests/models/deberta/test_tokenization_deberta.py | 1 + .../models/deberta_v2/test_tokenization_deberta_v2.py | 1 + .../models/distilbert/test_tokenization_distilbert.py | 1 + tests/models/dpr/test_tokenization_dpr.py | 3 +++ tests/models/electra/test_tokenization_electra.py | 1 + tests/models/ernie_m/test_tokenization_ernie_m.py | 1 + .../test_tokenization_fastspeech2_conformer.py | 1 + tests/models/fnet/test_tokenization_fnet.py | 1 + tests/models/fsmt/test_tokenization_fsmt.py | 1 + tests/models/funnel/test_tokenization_funnel.py | 1 + tests/models/gemma/test_tokenization_gemma.py | 1 + tests/models/gpt2/test_tokenization_gpt2.py | 1 + .../test_tokenization_gpt_neox_japanese.py | 1 + tests/models/gpt_sw3/test_tokenization_gpt_sw3.py | 1 + .../test_tokenization_gptsan_japanese.py | 1 + tests/models/herbert/test_tokenization_herbert.py | 1 + tests/models/layoutlm/test_tokenization_layoutlm.py | 1 + .../models/layoutlmv2/test_tokenization_layoutlmv2.py | 1 + .../models/layoutlmv3/test_tokenization_layoutlmv3.py | 1 + tests/models/layoutxlm/test_tokenization_layoutxlm.py | 1 + tests/models/led/test_tokenization_led.py | 1 + tests/models/llama/test_tokenization_llama.py | 1 + .../models/longformer/test_tokenization_longformer.py | 1 + tests/models/luke/test_tokenization_luke.py | 1 + tests/models/lxmert/test_tokenization_lxmert.py | 1 + tests/models/m2m_100/test_tokenization_m2m_100.py | 1 + tests/models/marian/test_tokenization_marian.py | 1 + tests/models/markuplm/test_tokenization_markuplm.py | 1 + tests/models/mbart/test_tokenization_mbart.py | 1 + tests/models/mbart50/test_tokenization_mbart50.py | 1 + tests/models/mgp_str/test_tokenization_mgp_str.py | 1 + tests/models/mluke/test_tokenization_mluke.py | 1 + .../models/mobilebert/test_tokenization_mobilebert.py | 1 + tests/models/mpnet/test_tokenization_mpnet.py | 1 + tests/models/mvp/test_tokenization_mvp.py | 1 + tests/models/nllb/test_tokenization_nllb.py | 1 + tests/models/nougat/test_tokenization_nougat.py | 1 + tests/models/openai/test_tokenization_openai.py | 1 + tests/models/pegasus/test_tokenization_pegasus.py | 2 ++ tests/models/perceiver/test_tokenization_perceiver.py | 1 + tests/models/phobert/test_tokenization_phobert.py | 1 + tests/models/plbart/test_tokenization_plbart.py | 1 + .../models/prophetnet/test_tokenization_prophetnet.py | 1 + tests/models/qwen2/test_tokenization_qwen2.py | 1 + tests/models/realm/test_tokenization_realm.py | 1 + tests/models/reformer/test_tokenization_reformer.py | 1 + tests/models/rembert/test_tokenization_rembert.py | 1 + tests/models/roberta/test_tokenization_roberta.py | 1 + tests/models/roc_bert/test_tokenization_roc_bert.py | 1 + tests/models/roformer/test_tokenization_roformer.py | 1 + .../seamless_m4t/test_tokenization_seamless_m4t.py | 1 + tests/models/siglip/test_tokenization_siglip.py | 1 + .../test_tokenization_speech_to_text.py | 1 + .../test_tokenization_speech_to_text_2.py | 1 + tests/models/speecht5/test_tokenization_speecht5.py | 1 + .../squeezebert/test_tokenization_squeezebert.py | 1 + tests/models/t5/test_tokenization_t5.py | 1 + tests/models/tapas/test_tokenization_tapas.py | 1 + tests/models/udop/test_tokenization_udop.py | 1 + tests/models/vits/test_tokenization_vits.py | 1 + tests/models/wav2vec2/test_tokenization_wav2vec2.py | 1 + .../test_tokenization_wav2vec2_phoneme.py | 1 + tests/models/whisper/test_tokenization_whisper.py | 1 + tests/models/xglm/test_tokenization_xglm.py | 1 + tests/models/xlm/test_tokenization_xlm.py | 1 + .../test_tokenization_xlm_prophetnet.py | 1 + .../xlm_roberta/test_tokenization_xlm_roberta.py | 1 + tests/models/xlnet/test_tokenization_xlnet.py | 1 + tests/test_tokenization_common.py | 11 +++-------- 89 files changed, 95 insertions(+), 8 deletions(-) diff --git a/tests/models/albert/test_tokenization_albert.py b/tests/models/albert/test_tokenization_albert.py index 343cba168f..e3f39257a6 100644 --- a/tests/models/albert/test_tokenization_albert.py +++ b/tests/models/albert/test_tokenization_albert.py @@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model") @require_sentencepiece @require_tokenizers class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "albert/albert-base-v1" tokenizer_class = AlbertTokenizer rust_tokenizer_class = AlbertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py index 746716161a..f3a63d6d41 100644 --- a/tests/models/bart/test_tokenization_bart.py +++ b/tests/models/bart/test_tokenization_bart.py @@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_det @require_tokenizers class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/bart-base" tokenizer_class = BartTokenizer rust_tokenizer_class = BartTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/barthez/test_tokenization_barthez.py b/tests/models/barthez/test_tokenization_barthez.py index 7759d3560d..b2b0c7b058 100644 --- a/tests/models/barthez/test_tokenization_barthez.py +++ b/tests/models/barthez/test_tokenization_barthez.py @@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_sentencepiece @slow # see https://github.com/huggingface/transformers/issues/11457 class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "moussaKam/mbarthez" tokenizer_class = BarthezTokenizer rust_tokenizer_class = BarthezTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/bartpho/test_tokenization_bartpho.py b/tests/models/bartpho/test_tokenization_bartpho.py index 1fc06e38e0..023584e91f 100644 --- a/tests/models/bartpho/test_tokenization_bartpho.py +++ b/tests/models/bartpho/test_tokenization_bartpho.py @@ -26,6 +26,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model") class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "vinai/bartpho-syllable" tokenizer_class = BartphoTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py index bee1ccf0d1..cf3cc1dce1 100644 --- a/tests/models/bert/test_tokenization_bert.py +++ b/tests/models/bert/test_tokenization_bert.py @@ -34,6 +34,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english @require_tokenizers class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google-bert/bert-base-uncased" tokenizer_class = BertTokenizer rust_tokenizer_class = BertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/bert_generation/test_tokenization_bert_generation.py b/tests/models/bert_generation/test_tokenization_bert_generation.py index 41d9928835..e1ccfba8f4 100644 --- a/tests/models/bert_generation/test_tokenization_bert_generation.py +++ b/tests/models/bert_generation/test_tokenization_bert_generation.py @@ -29,6 +29,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_sentencepiece class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/bert_for_seq_generation_L-24_bbc_encoder" tokenizer_class = BertGenerationTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py index d2a7accb39..d4954c9652 100644 --- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py +++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py @@ -36,6 +36,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @custom_tokenizers class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "cl-tohoku/bert-base-japanese" tokenizer_class = BertJapaneseTokenizer test_rust_tokenizer = False space_between_special_tokens = True @@ -403,6 +404,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @custom_tokenizers class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "cl-tohoku/bert-base-japanese" tokenizer_class = BertJapaneseTokenizer test_rust_tokenizer = False diff --git a/tests/models/bertweet/test_tokenization_bertweet.py b/tests/models/bertweet/test_tokenization_bertweet.py index 2a4c643269..71e0a0afe5 100644 --- a/tests/models/bertweet/test_tokenization_bertweet.py +++ b/tests/models/bertweet/test_tokenization_bertweet.py @@ -22,6 +22,7 @@ from ...test_tokenization_common import TokenizerTesterMixin class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "vinai/bertweet-base" tokenizer_class = BertweetTokenizer test_rust_tokenizer = False diff --git a/tests/models/big_bird/test_tokenization_big_bird.py b/tests/models/big_bird/test_tokenization_big_bird.py index 23b25e4029..863d30e849 100644 --- a/tests/models/big_bird/test_tokenization_big_bird.py +++ b/tests/models/big_bird/test_tokenization_big_bird.py @@ -30,6 +30,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_sentencepiece @require_tokenizers class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/bigbird-roberta-base" tokenizer_class = BigBirdTokenizer rust_tokenizer_class = BigBirdTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/biogpt/test_tokenization_biogpt.py b/tests/models/biogpt/test_tokenization_biogpt.py index c350f5de0e..ea52a7cf7f 100644 --- a/tests/models/biogpt/test_tokenization_biogpt.py +++ b/tests/models/biogpt/test_tokenization_biogpt.py @@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_sacremoses class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/biogpt" tokenizer_class = BioGptTokenizer test_rust_tokenizer = False diff --git a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py index b022e77682..369dde6739 100644 --- a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py @@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/blenderbot_small-90M" tokenizer_class = BlenderbotSmallTokenizer test_rust_tokenizer = False diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py index 02491929d1..4fbfcb8923 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "bigscience/tokenizer" slow_tokenizer_class = None rust_tokenizer_class = BloomTokenizerFast tokenizer_class = BloomTokenizerFast diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py index 33254b96de..624338b7f0 100644 --- a/tests/models/camembert/test_tokenization_camembert.py +++ b/tests/models/camembert/test_tokenization_camembert.py @@ -32,6 +32,7 @@ FRAMEWORK = "pt" if is_torch_available() else "tf" @require_sentencepiece @require_tokenizers class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "almanach/camembert-base" tokenizer_class = CamembertTokenizer rust_tokenizer_class = CamembertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py index da4665e4cf..eb3e6d9b4a 100644 --- a/tests/models/canine/test_tokenization_canine.py +++ b/tests/models/canine/test_tokenization_canine.py @@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "nielsr/canine-s" tokenizer_class = CanineTokenizer test_rust_tokenizer = False diff --git a/tests/models/clip/test_tokenization_clip.py b/tests/models/clip/test_tokenization_clip.py index 4f1d9a73ef..ec1cbd08ac 100644 --- a/tests/models/clip/test_tokenization_clip.py +++ b/tests/models/clip/test_tokenization_clip.py @@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "openai/clip-vit-base-patch32" tokenizer_class = CLIPTokenizer rust_tokenizer_class = CLIPTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/clvp/test_tokenization_clvp.py b/tests/models/clvp/test_tokenization_clvp.py index b636888759..7bb522f414 100644 --- a/tests/models/clvp/test_tokenization_clvp.py +++ b/tests/models/clvp/test_tokenization_clvp.py @@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, slow class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "susnato/clvp_dev" tokenizer_class = ClvpTokenizer test_rust_tokenizer = False from_pretrained_kwargs = {"add_prefix_space": True} diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py index d39454b0fa..2a71ded72a 100644 --- a/tests/models/code_llama/test_tokenization_code_llama.py +++ b/tests/models/code_llama/test_tokenization_code_llama.py @@ -51,6 +51,7 @@ if is_torch_available(): @require_sentencepiece @require_tokenizers class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "hf-internal-testing/llama-code-tokenizer" tokenizer_class = CodeLlamaTokenizer rust_tokenizer_class = CodeLlamaTokenizerFast test_rust_tokenizer = False diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py index edffbeaec9..025ed99b9a 100644 --- a/tests/models/codegen/test_tokenization_codegen.py +++ b/tests/models/codegen/test_tokenization_codegen.py @@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "Salesforce/codegen-350M-mono" tokenizer_class = CodeGenTokenizer rust_tokenizer_class = CodeGenTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/cpmant/test_tokenization_cpmant.py b/tests/models/cpmant/test_tokenization_cpmant.py index f5d0ef3245..042473065b 100644 --- a/tests/models/cpmant/test_tokenization_cpmant.py +++ b/tests/models/cpmant/test_tokenization_cpmant.py @@ -24,6 +24,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_jieba class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "openbmb/cpm-ant-10b" tokenizer_class = CpmAntTokenizer test_rust_tokenizer = False diff --git a/tests/models/ctrl/test_tokenization_ctrl.py b/tests/models/ctrl/test_tokenization_ctrl.py index 02c3459f9e..7fe61f3607 100644 --- a/tests/models/ctrl/test_tokenization_ctrl.py +++ b/tests/models/ctrl/test_tokenization_ctrl.py @@ -23,6 +23,7 @@ from ...test_tokenization_common import TokenizerTesterMixin class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "Salesforce/ctrl" tokenizer_class = CTRLTokenizer test_rust_tokenizer = False test_seq2seq = False diff --git a/tests/models/deberta/test_tokenization_deberta.py b/tests/models/deberta/test_tokenization_deberta.py index 81d7bd95bd..96248cf2ec 100644 --- a/tests/models/deberta/test_tokenization_deberta.py +++ b/tests/models/deberta/test_tokenization_deberta.py @@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/deberta-base" tokenizer_class = DebertaTokenizer test_rust_tokenizer = True rust_tokenizer_class = DebertaTokenizerFast diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py index c75f45bfe8..55f7e8b542 100644 --- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py +++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py @@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model") @require_sentencepiece @require_tokenizers class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/deberta-v2-xlarge" tokenizer_class = DebertaV2Tokenizer rust_tokenizer_class = DebertaV2TokenizerFast test_sentencepiece = True diff --git a/tests/models/distilbert/test_tokenization_distilbert.py b/tests/models/distilbert/test_tokenization_distilbert.py index 0942239572..c61393f6a6 100644 --- a/tests/models/distilbert/test_tokenization_distilbert.py +++ b/tests/models/distilbert/test_tokenization_distilbert.py @@ -25,6 +25,7 @@ class DistilBertTokenizationTest(BertTokenizationTest): tokenizer_class = DistilBertTokenizer rust_tokenizer_class = DistilBertTokenizerFast test_rust_tokenizer = True + from_pretrained_id = "distilbert/distilbert-base-uncased" @slow def test_sequence_builders(self): diff --git a/tests/models/dpr/test_tokenization_dpr.py b/tests/models/dpr/test_tokenization_dpr.py index 2e0f41da4d..1fd3d8bdb9 100644 --- a/tests/models/dpr/test_tokenization_dpr.py +++ b/tests/models/dpr/test_tokenization_dpr.py @@ -33,6 +33,7 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest): tokenizer_class = DPRContextEncoderTokenizer rust_tokenizer_class = DPRContextEncoderTokenizerFast test_rust_tokenizer = True + from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base" @require_tokenizers @@ -40,6 +41,7 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): tokenizer_class = DPRQuestionEncoderTokenizer rust_tokenizer_class = DPRQuestionEncoderTokenizerFast test_rust_tokenizer = True + from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base" @require_tokenizers @@ -47,6 +49,7 @@ class DPRReaderTokenizationTest(BertTokenizationTest): tokenizer_class = DPRReaderTokenizer rust_tokenizer_class = DPRReaderTokenizerFast test_rust_tokenizer = True + from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base" @slow def test_decode_best_spans(self): diff --git a/tests/models/electra/test_tokenization_electra.py b/tests/models/electra/test_tokenization_electra.py index 1c9b517f1f..64611cb09c 100644 --- a/tests/models/electra/test_tokenization_electra.py +++ b/tests/models/electra/test_tokenization_electra.py @@ -33,6 +33,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english @require_tokenizers class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/electra-small-generator" tokenizer_class = ElectraTokenizer rust_tokenizer_class = ElectraTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/ernie_m/test_tokenization_ernie_m.py b/tests/models/ernie_m/test_tokenization_ernie_m.py index 19f144df45..5cc5ec6991 100644 --- a/tests/models/ernie_m/test_tokenization_ernie_m.py +++ b/tests/models/ernie_m/test_tokenization_ernie_m.py @@ -28,6 +28,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model") @require_sentencepiece @require_tokenizers class ErnieMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "susnato/ernie-m-base_pytorch" tokenizer_class = ErnieMTokenizer test_seq2seq = False test_sentencepiece = True diff --git a/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py index 0c0e269617..119e35555a 100644 --- a/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py +++ b/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py @@ -24,6 +24,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_g2p_en class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "espnet/fastspeech2_conformer" tokenizer_class = FastSpeech2ConformerTokenizer test_rust_tokenizer = False diff --git a/tests/models/fnet/test_tokenization_fnet.py b/tests/models/fnet/test_tokenization_fnet.py index 85080efc3e..a3492cf966 100644 --- a/tests/models/fnet/test_tokenization_fnet.py +++ b/tests/models/fnet/test_tokenization_fnet.py @@ -28,6 +28,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model") @require_sentencepiece @require_tokenizers class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/fnet-base" tokenizer_class = FNetTokenizer rust_tokenizer_class = FNetTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/fsmt/test_tokenization_fsmt.py b/tests/models/fsmt/test_tokenization_fsmt.py index 7407c2fbc8..4be15cbee1 100644 --- a/tests/models/fsmt/test_tokenization_fsmt.py +++ b/tests/models/fsmt/test_tokenization_fsmt.py @@ -30,6 +30,7 @@ FSMT_TINY2 = "stas/tiny-wmt19-en-ru" class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "stas/tiny-wmt19-en-de" tokenizer_class = FSMTTokenizer test_rust_tokenizer = False diff --git a/tests/models/funnel/test_tokenization_funnel.py b/tests/models/funnel/test_tokenization_funnel.py index 6c5eb87db1..7628582e9f 100644 --- a/tests/models/funnel/test_tokenization_funnel.py +++ b/tests/models/funnel/test_tokenization_funnel.py @@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "funnel-transformer/small" tokenizer_class = FunnelTokenizer rust_tokenizer_class = FunnelTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py index a16d471a24..5e485da491 100644 --- a/tests/models/gemma/test_tokenization_gemma.py +++ b/tests/models/gemma/test_tokenization_gemma.py @@ -49,6 +49,7 @@ if is_torch_available(): @require_sentencepiece @require_tokenizers class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/gemma-7b" tokenizer_class = GemmaTokenizer rust_tokenizer_class = GemmaTokenizerFast diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py index 78906e3db3..1e7c81e4be 100644 --- a/tests/models/gpt2/test_tokenization_gpt2.py +++ b/tests/models/gpt2/test_tokenization_gpt2.py @@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "openai-community/gpt2" tokenizer_class = GPT2Tokenizer rust_tokenizer_class = GPT2TokenizerFast test_rust_tokenizer = True diff --git a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py index 293116a24e..ec505da4a0 100644 --- a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py +++ b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py @@ -29,6 +29,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "abeja/gpt-neox-japanese-2.7b" tokenizer_class = GPTNeoXJapaneseTokenizer test_rust_tokenizer = False from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False} diff --git a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py index 7bbaf74882..ae9526342c 100644 --- a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py +++ b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py @@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_with_bytefallback.mode @require_sentencepiece @require_tokenizers class GPTSw3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "AI-Sweden-Models/gpt-sw3-126m" tokenizer_class = GPTSw3Tokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py index 6d656b2d0f..8d989a51a7 100644 --- a/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py +++ b/tests/models/gptsan_japanese/test_tokenization_gptsan_japanese.py @@ -29,6 +29,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class GPTSanJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "Tanrei/GPTSAN-japanese" tokenizer_class = GPTSanJapaneseTokenizer test_rust_tokenizer = False from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False} diff --git a/tests/models/herbert/test_tokenization_herbert.py b/tests/models/herbert/test_tokenization_herbert.py index d035348b73..b8bbd77758 100644 --- a/tests/models/herbert/test_tokenization_herbert.py +++ b/tests/models/herbert/test_tokenization_herbert.py @@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_sacremoses @require_tokenizers class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "allegro/herbert-base-cased" tokenizer_class = HerbertTokenizer rust_tokenizer_class = HerbertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/layoutlm/test_tokenization_layoutlm.py b/tests/models/layoutlm/test_tokenization_layoutlm.py index b73b2aa8e4..a34811a90b 100644 --- a/tests/models/layoutlm/test_tokenization_layoutlm.py +++ b/tests/models/layoutlm/test_tokenization_layoutlm.py @@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/layoutlm-base-uncased" tokenizer_class = LayoutLMTokenizer rust_tokenizer_class = LayoutLMTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index 3360933be6..61fafa23da 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -61,6 +61,7 @@ logger = logging.get_logger(__name__) @require_tokenizers @require_pandas class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/layoutlmv2-base-uncased" tokenizer_class = LayoutLMv2Tokenizer rust_tokenizer_class = LayoutLMv2TokenizerFast test_rust_tokenizer = True diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index db02dc65d6..c7af4fbddc 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -49,6 +49,7 @@ logger = logging.get_logger(__name__) @require_tokenizers @require_pandas class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/layoutlmv3-base" tokenizer_class = LayoutLMv3Tokenizer rust_tokenizer_class = LayoutLMv3TokenizerFast test_rust_tokenizer = True diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index 086bbc6ba0..474cc53171 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -54,6 +54,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_tokenizers @require_pandas class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "FacebookAI/xlm-roberta-base" tokenizer_class = LayoutXLMTokenizer rust_tokenizer_class = LayoutXLMTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/led/test_tokenization_led.py b/tests/models/led/test_tokenization_led.py index 7ff8174994..f287677a12 100644 --- a/tests/models/led/test_tokenization_led.py +++ b/tests/models/led/test_tokenization_led.py @@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "allenai/led-base-16384" tokenizer_class = LEDTokenizer rust_tokenizer_class = LEDTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index f3674a83b0..e99104f7f6 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -52,6 +52,7 @@ if is_torch_available(): @require_sentencepiece @require_tokenizers class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "hf-internal-testing/llama-tokenizer" tokenizer_class = LlamaTokenizer rust_tokenizer_class = LlamaTokenizerFast diff --git a/tests/models/longformer/test_tokenization_longformer.py b/tests/models/longformer/test_tokenization_longformer.py index 42524ca65a..1d1eda3380 100644 --- a/tests/models/longformer/test_tokenization_longformer.py +++ b/tests/models/longformer/test_tokenization_longformer.py @@ -30,6 +30,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest with FacebookAI/roberta-base->allenai/longformer-base-4096,Roberta->Longformer,roberta->longformer, class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "allenai/longformer-base-4096" # Ignore copy tokenizer_class = LongformerTokenizer test_slow_tokenizer = True diff --git a/tests/models/luke/test_tokenization_luke.py b/tests/models/luke/test_tokenization_luke.py index 26797faf77..0e5d912315 100644 --- a/tests/models/luke/test_tokenization_luke.py +++ b/tests/models/luke/test_tokenization_luke.py @@ -28,6 +28,7 @@ SAMPLE_ENTITY_VOCAB = get_tests_dir("fixtures/test_entity_vocab.json") class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "studio-ousia/luke-base" tokenizer_class = LukeTokenizer test_rust_tokenizer = False from_pretrained_kwargs = {"cls_token": ""} diff --git a/tests/models/lxmert/test_tokenization_lxmert.py b/tests/models/lxmert/test_tokenization_lxmert.py index e094427f76..716e3b971f 100644 --- a/tests/models/lxmert/test_tokenization_lxmert.py +++ b/tests/models/lxmert/test_tokenization_lxmert.py @@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "unc-nlp/lxmert-base-uncased" tokenizer_class = LxmertTokenizer rust_tokenizer_class = LxmertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/m2m_100/test_tokenization_m2m_100.py b/tests/models/m2m_100/test_tokenization_m2m_100.py index 50087a7d9d..ced6cf13de 100644 --- a/tests/models/m2m_100/test_tokenization_m2m_100.py +++ b/tests/models/m2m_100/test_tokenization_m2m_100.py @@ -48,6 +48,7 @@ FR_CODE = 128028 @require_sentencepiece class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/m2m100_418M" tokenizer_class = M2M100Tokenizer test_rust_tokenizer = False test_seq2seq = False diff --git a/tests/models/marian/test_tokenization_marian.py b/tests/models/marian/test_tokenization_marian.py index 6fb3c9a85d..3ef85e24de 100644 --- a/tests/models/marian/test_tokenization_marian.py +++ b/tests/models/marian/test_tokenization_marian.py @@ -45,6 +45,7 @@ else: @require_sentencepiece class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "Helsinki-NLP/opus-mt-en-de" tokenizer_class = MarianTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index e793a9a507..df1b9ed083 100644 --- a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -41,6 +41,7 @@ logger = logging.get_logger(__name__) @require_tokenizers class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/markuplm-base" tokenizer_class = MarkupLMTokenizer rust_tokenizer_class = MarkupLMTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/mbart/test_tokenization_mbart.py b/tests/models/mbart/test_tokenization_mbart.py index e5a9d8c07f..635be07aa1 100644 --- a/tests/models/mbart/test_tokenization_mbart.py +++ b/tests/models/mbart/test_tokenization_mbart.py @@ -41,6 +41,7 @@ RO_CODE = 250020 @require_sentencepiece @require_tokenizers class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/mbart-large-en-ro" tokenizer_class = MBartTokenizer rust_tokenizer_class = MBartTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/mbart50/test_tokenization_mbart50.py b/tests/models/mbart50/test_tokenization_mbart50.py index a5ba802b6c..799cd8afc3 100644 --- a/tests/models/mbart50/test_tokenization_mbart50.py +++ b/tests/models/mbart50/test_tokenization_mbart50.py @@ -41,6 +41,7 @@ RO_CODE = 250020 @require_sentencepiece @require_tokenizers class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/mbart-large-50-one-to-many-mmt" tokenizer_class = MBart50Tokenizer rust_tokenizer_class = MBart50TokenizerFast test_rust_tokenizer = True diff --git a/tests/models/mgp_str/test_tokenization_mgp_str.py b/tests/models/mgp_str/test_tokenization_mgp_str.py index 0d0e6bb0bf..035d43cc43 100644 --- a/tests/models/mgp_str/test_tokenization_mgp_str.py +++ b/tests/models/mgp_str/test_tokenization_mgp_str.py @@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class MgpstrTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "alibaba-damo/mgp-str-base" tokenizer_class = MgpstrTokenizer test_rust_tokenizer = False from_pretrained_kwargs = {} diff --git a/tests/models/mluke/test_tokenization_mluke.py b/tests/models/mluke/test_tokenization_mluke.py index a466ae547c..0497fa849c 100644 --- a/tests/models/mluke/test_tokenization_mluke.py +++ b/tests/models/mluke/test_tokenization_mluke.py @@ -28,6 +28,7 @@ SAMPLE_ENTITY_VOCAB = get_tests_dir("fixtures/test_entity_vocab.json") class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "studio-ousia/mluke-base" tokenizer_class = MLukeTokenizer test_rust_tokenizer = False from_pretrained_kwargs = {"cls_token": ""} diff --git a/tests/models/mobilebert/test_tokenization_mobilebert.py b/tests/models/mobilebert/test_tokenization_mobilebert.py index 92ddd88684..4d5e09e08d 100644 --- a/tests/models/mobilebert/test_tokenization_mobilebert.py +++ b/tests/models/mobilebert/test_tokenization_mobilebert.py @@ -34,6 +34,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english @require_tokenizers class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "mobilebert-uncased" tokenizer_class = MobileBertTokenizer rust_tokenizer_class = MobileBertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/mpnet/test_tokenization_mpnet.py b/tests/models/mpnet/test_tokenization_mpnet.py index e30dd3a914..b63dc7ab64 100644 --- a/tests/models/mpnet/test_tokenization_mpnet.py +++ b/tests/models/mpnet/test_tokenization_mpnet.py @@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/mpnet-base" tokenizer_class = MPNetTokenizer rust_tokenizer_class = MPNetTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/mvp/test_tokenization_mvp.py b/tests/models/mvp/test_tokenization_mvp.py index 8bddb8443b..a442848d4d 100644 --- a/tests/models/mvp/test_tokenization_mvp.py +++ b/tests/models/mvp/test_tokenization_mvp.py @@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_det @require_tokenizers class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "RUCAIBox/mvp" tokenizer_class = MvpTokenizer rust_tokenizer_class = MvpTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py index 4446522f9d..92134c3f8b 100644 --- a/tests/models/nllb/test_tokenization_nllb.py +++ b/tests/models/nllb/test_tokenization_nllb.py @@ -49,6 +49,7 @@ RO_CODE = 256145 @require_sentencepiece @require_tokenizers class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/nllb-200-distilled-600M" tokenizer_class = NllbTokenizer rust_tokenizer_class = NllbTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/nougat/test_tokenization_nougat.py b/tests/models/nougat/test_tokenization_nougat.py index bfb1090dad..088ce56f6e 100644 --- a/tests/models/nougat/test_tokenization_nougat.py +++ b/tests/models/nougat/test_tokenization_nougat.py @@ -24,6 +24,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class NougatTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/nougat-base" slow_tokenizer_class = None rust_tokenizer_class = NougatTokenizerFast tokenizer_class = NougatTokenizerFast diff --git a/tests/models/openai/test_tokenization_openai.py b/tests/models/openai/test_tokenization_openai.py index 2603063291..1f5ef5a35b 100644 --- a/tests/models/openai/test_tokenization_openai.py +++ b/tests/models/openai/test_tokenization_openai.py @@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "openai-community/openai-gpt" """Tests OpenAIGPTTokenizer that uses BERT BasicTokenizer.""" tokenizer_class = OpenAIGPTTokenizer diff --git a/tests/models/pegasus/test_tokenization_pegasus.py b/tests/models/pegasus/test_tokenization_pegasus.py index 3abe36d118..66a68a97fc 100644 --- a/tests/models/pegasus/test_tokenization_pegasus.py +++ b/tests/models/pegasus/test_tokenization_pegasus.py @@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model") @require_sentencepiece @require_tokenizers class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/pegasus-xsum" tokenizer_class = PegasusTokenizer rust_tokenizer_class = PegasusTokenizerFast test_rust_tokenizer = True @@ -135,6 +136,7 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @require_sentencepiece @require_tokenizers class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/pegasus-xsum" tokenizer_class = PegasusTokenizer rust_tokenizer_class = PegasusTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py index 6d9a9bd863..b5d149e5f2 100644 --- a/tests/models/perceiver/test_tokenization_perceiver.py +++ b/tests/models/perceiver/test_tokenization_perceiver.py @@ -36,6 +36,7 @@ else: class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "deepmind/language-perceiver" tokenizer_class = PerceiverTokenizer test_rust_tokenizer = False diff --git a/tests/models/phobert/test_tokenization_phobert.py b/tests/models/phobert/test_tokenization_phobert.py index 6624957531..bdf02d5f51 100644 --- a/tests/models/phobert/test_tokenization_phobert.py +++ b/tests/models/phobert/test_tokenization_phobert.py @@ -22,6 +22,7 @@ from ...test_tokenization_common import TokenizerTesterMixin class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "vinai/phobert-base" tokenizer_class = PhobertTokenizer test_rust_tokenizer = False diff --git a/tests/models/plbart/test_tokenization_plbart.py b/tests/models/plbart/test_tokenization_plbart.py index f9cc38e0de..ff0ef386e3 100644 --- a/tests/models/plbart/test_tokenization_plbart.py +++ b/tests/models/plbart/test_tokenization_plbart.py @@ -40,6 +40,7 @@ PYTHON_CODE = 50002 @require_sentencepiece @require_tokenizers class PLBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "uclanlp/plbart-base" tokenizer_class = PLBartTokenizer rust_tokenizer_class = None test_rust_tokenizer = False diff --git a/tests/models/prophetnet/test_tokenization_prophetnet.py b/tests/models/prophetnet/test_tokenization_prophetnet.py index cf4317b3a6..09390db48b 100644 --- a/tests/models/prophetnet/test_tokenization_prophetnet.py +++ b/tests/models/prophetnet/test_tokenization_prophetnet.py @@ -32,6 +32,7 @@ from ...test_tokenization_common import TokenizerTesterMixin class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/prophetnet-large-uncased" tokenizer_class = ProphetNetTokenizer test_rust_tokenizer = False diff --git a/tests/models/qwen2/test_tokenization_qwen2.py b/tests/models/qwen2/test_tokenization_qwen2.py index 49c62b5241..3193141b84 100644 --- a/tests/models/qwen2/test_tokenization_qwen2.py +++ b/tests/models/qwen2/test_tokenization_qwen2.py @@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "qwen/qwen-tokenizer" tokenizer_class = Qwen2Tokenizer rust_tokenizer_class = Qwen2TokenizerFast test_slow_tokenizer = True diff --git a/tests/models/realm/test_tokenization_realm.py b/tests/models/realm/test_tokenization_realm.py index 7dbd8df6ef..f963fb347f 100644 --- a/tests/models/realm/test_tokenization_realm.py +++ b/tests/models/realm/test_tokenization_realm.py @@ -33,6 +33,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english @require_tokenizers class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/realm-cc-news-pretrained-embedder" tokenizer_class = RealmTokenizer rust_tokenizer_class = RealmTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/reformer/test_tokenization_reformer.py b/tests/models/reformer/test_tokenization_reformer.py index 0f72bf311a..ee9bd52f43 100644 --- a/tests/models/reformer/test_tokenization_reformer.py +++ b/tests/models/reformer/test_tokenization_reformer.py @@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_sentencepiece @require_tokenizers class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/reformer-crime-and-punishment" tokenizer_class = ReformerTokenizer rust_tokenizer_class = ReformerTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/rembert/test_tokenization_rembert.py b/tests/models/rembert/test_tokenization_rembert.py index 12c43643be..5f65629213 100644 --- a/tests/models/rembert/test_tokenization_rembert.py +++ b/tests/models/rembert/test_tokenization_rembert.py @@ -32,6 +32,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_sentencepiece @require_tokenizers class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/rembert" tokenizer_class = RemBertTokenizer rust_tokenizer_class = RemBertTokenizerFast space_between_special_tokens = True diff --git a/tests/models/roberta/test_tokenization_roberta.py b/tests/models/roberta/test_tokenization_roberta.py index 5d457c4cb4..83f444d162 100644 --- a/tests/models/roberta/test_tokenization_roberta.py +++ b/tests/models/roberta/test_tokenization_roberta.py @@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_tokenizers class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "FacebookAI/roberta-base" tokenizer_class = RobertaTokenizer rust_tokenizer_class = RobertaTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/roc_bert/test_tokenization_roc_bert.py b/tests/models/roc_bert/test_tokenization_roc_bert.py index 6a24514b3c..4fb2b172d6 100644 --- a/tests/models/roc_bert/test_tokenization_roc_bert.py +++ b/tests/models/roc_bert/test_tokenization_roc_bert.py @@ -34,6 +34,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english @require_tokenizers class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "weiweishi/roc-bert-base-zh" tokenizer_class = RoCBertTokenizer rust_tokenizer_class = None test_rust_tokenizer = False diff --git a/tests/models/roformer/test_tokenization_roformer.py b/tests/models/roformer/test_tokenization_roformer.py index 3af411b6a8..c109608107 100644 --- a/tests/models/roformer/test_tokenization_roformer.py +++ b/tests/models/roformer/test_tokenization_roformer.py @@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_rjieba @require_tokenizers class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "junnyu/roformer_chinese_small" tokenizer_class = RoFormerTokenizer rust_tokenizer_class = RoFormerTokenizerFast space_between_special_tokens = True diff --git a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py index c7d16796c4..2e65d01ead 100644 --- a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py @@ -53,6 +53,7 @@ SMALL_TRAINING_CORPUS = [ @require_sentencepiece @require_tokenizers class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/hf-seamless-m4t-medium" tokenizer_class = SeamlessM4TTokenizer rust_tokenizer_class = SeamlessM4TTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/siglip/test_tokenization_siglip.py b/tests/models/siglip/test_tokenization_siglip.py index 586d6b0089..fb3cb5b3f1 100644 --- a/tests/models/siglip/test_tokenization_siglip.py +++ b/tests/models/siglip/test_tokenization_siglip.py @@ -38,6 +38,7 @@ else: @require_sentencepiece @require_tokenizers class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/siglip-base-patch16-224" tokenizer_class = SiglipTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/speech_to_text/test_tokenization_speech_to_text.py b/tests/models/speech_to_text/test_tokenization_speech_to_text.py index b0cb1acc85..6bea58ddfc 100644 --- a/tests/models/speech_to_text/test_tokenization_speech_to_text.py +++ b/tests/models/speech_to_text/test_tokenization_speech_to_text.py @@ -37,6 +37,7 @@ ES_CODE = 10 @require_sentencepiece @require_tokenizers class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/s2t-small-librispeech-asr" tokenizer_class = Speech2TextTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py b/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py index 1000cce289..df433d67d9 100644 --- a/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py +++ b/tests/models/speech_to_text_2/test_tokenization_speech_to_text_2.py @@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/s2t-wav2vec2-large-en-de" tokenizer_class = Speech2Text2Tokenizer test_rust_tokenizer = False diff --git a/tests/models/speecht5/test_tokenization_speecht5.py b/tests/models/speecht5/test_tokenization_speecht5.py index a8af8d274a..d007b14dd2 100644 --- a/tests/models/speecht5/test_tokenization_speecht5.py +++ b/tests/models/speecht5/test_tokenization_speecht5.py @@ -30,6 +30,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe_char.model") @require_sentencepiece @require_tokenizers class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/speecht5_asr" tokenizer_class = SpeechT5Tokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/squeezebert/test_tokenization_squeezebert.py b/tests/models/squeezebert/test_tokenization_squeezebert.py index a658625564..3ac24e8374 100644 --- a/tests/models/squeezebert/test_tokenization_squeezebert.py +++ b/tests/models/squeezebert/test_tokenization_squeezebert.py @@ -25,6 +25,7 @@ class SqueezeBertTokenizationTest(BertTokenizationTest): tokenizer_class = SqueezeBertTokenizer rust_tokenizer_class = SqueezeBertTokenizerFast test_rust_tokenizer = True + from_pretrained_id = "squeezebert/squeezebert-uncased" def get_rust_tokenizer(self, **kwargs): return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index b0755dc1ba..388388ff23 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -38,6 +38,7 @@ else: @require_sentencepiece @require_tokenizers class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google-t5/t5-small" tokenizer_class = T5Tokenizer rust_tokenizer_class = T5TokenizerFast test_rust_tokenizer = True diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index 692dc91b6d..9aa3ad6d58 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -53,6 +53,7 @@ else: @require_tokenizers @require_pandas class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "google/tapas-large-finetuned-sqa" tokenizer_class = TapasTokenizer test_rust_tokenizer = False space_between_special_tokens = True diff --git a/tests/models/udop/test_tokenization_udop.py b/tests/models/udop/test_tokenization_udop.py index cc9a2f2852..f9ad6c7abe 100644 --- a/tests/models/udop/test_tokenization_udop.py +++ b/tests/models/udop/test_tokenization_udop.py @@ -54,6 +54,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_tokenizers @require_pandas class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/udop-large" tokenizer_class = UdopTokenizer rust_tokenizer_class = UdopTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/vits/test_tokenization_vits.py b/tests/models/vits/test_tokenization_vits.py index c02caaaa90..fee6bac3a4 100644 --- a/tests/models/vits/test_tokenization_vits.py +++ b/tests/models/vits/test_tokenization_vits.py @@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/mms-tts-eng" tokenizer_class = VitsTokenizer test_rust_tokenizer = False diff --git a/tests/models/wav2vec2/test_tokenization_wav2vec2.py b/tests/models/wav2vec2/test_tokenization_wav2vec2.py index 3ab8717d81..7310b14848 100644 --- a/tests/models/wav2vec2/test_tokenization_wav2vec2.py +++ b/tests/models/wav2vec2/test_tokenization_wav2vec2.py @@ -367,6 +367,7 @@ class Wav2Vec2TokenizerTest(unittest.TestCase): class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/wav2vec2-base-960h" tokenizer_class = Wav2Vec2CTCTokenizer test_rust_tokenizer = False diff --git a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py index 0411a863bc..56e38f2cf5 100644 --- a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py +++ b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py @@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin @require_phonemizer class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/wav2vec2-lv-60-espeak-cv-ft" tokenizer_class = Wav2Vec2PhonemeCTCTokenizer test_rust_tokenizer = False diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py index 170857cffb..bb22a36f08 100644 --- a/tests/models/whisper/test_tokenization_whisper.py +++ b/tests/models/whisper/test_tokenization_whisper.py @@ -31,6 +31,7 @@ NOTIMESTAMPS = 50363 class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "openai/whisper-tiny" tokenizer_class = WhisperTokenizer rust_tokenizer_class = WhisperTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/xglm/test_tokenization_xglm.py b/tests/models/xglm/test_tokenization_xglm.py index 61674976a3..02c58681d1 100644 --- a/tests/models/xglm/test_tokenization_xglm.py +++ b/tests/models/xglm/test_tokenization_xglm.py @@ -31,6 +31,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_sentencepiece @require_tokenizers class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/xglm-564M" tokenizer_class = XGLMTokenizer rust_tokenizer_class = XGLMTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/xlm/test_tokenization_xlm.py b/tests/models/xlm/test_tokenization_xlm.py index 4b5982ca98..6bc7fedad4 100644 --- a/tests/models/xlm/test_tokenization_xlm.py +++ b/tests/models/xlm/test_tokenization_xlm.py @@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "FacebookAI/xlm-mlm-en-2048" tokenizer_class = XLMTokenizer test_rust_tokenizer = False diff --git a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py b/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py index 679e808dc9..cadcc60049 100644 --- a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py +++ b/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py @@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_sentencepiece class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/xprophetnet-large-wiki100-cased" tokenizer_class = XLMProphetNetTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py index 6e2d4446a0..8c3674460d 100644 --- a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py +++ b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py @@ -31,6 +31,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_sentencepiece @require_tokenizers class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "FacebookAI/xlm-roberta-base" tokenizer_class = XLMRobertaTokenizer rust_tokenizer_class = XLMRobertaTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/xlnet/test_tokenization_xlnet.py b/tests/models/xlnet/test_tokenization_xlnet.py index 8a7476fad9..bd65e6c80b 100644 --- a/tests/models/xlnet/test_tokenization_xlnet.py +++ b/tests/models/xlnet/test_tokenization_xlnet.py @@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_sentencepiece @require_tokenizers class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + from_pretrained_id = "xlnet/xlnet-base-cased" tokenizer_class = XLNetTokenizer rust_tokenizer_class = XLNetTokenizerFast test_rust_tokenizer = True diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index d0c5874911..6c900fa72c 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -186,6 +186,7 @@ class TokenizerTesterMixin: space_between_special_tokens = False from_pretrained_kwargs = None from_pretrained_filter = None + from_pretrained_id = None from_pretrained_vocab_key = "vocab_file" test_seq2seq = True @@ -200,19 +201,13 @@ class TokenizerTesterMixin: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) if self.test_rust_tokenizer: - tokenizers_list = [ + self.tokenizers_list = [ ( self.rust_tokenizer_class, - pretrained_name, + self.from_pretrained_id, self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}, ) - for pretrained_name in self.rust_tokenizer_class.pretrained_vocab_files_map[ - self.from_pretrained_vocab_key - ].keys() - if self.from_pretrained_filter is None - or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name)) ] - self.tokenizers_list = tokenizers_list[:1] # Let's just test the first pretrained vocab for speed else: self.tokenizers_list = [] with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: