From 6494910f2741befae281388db0d9eacfbe82fad3 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 19 Nov 2020 16:44:20 -0500 Subject: [PATCH] Add sentencepiece to the CI and fix tests (#8672) * Fix the CI and tests * Fix quality * Remove that m from nowhere --- .circleci/config.yml | 18 +++++++++--------- .../models/mbart/tokenization_mbart.py | 2 +- .../models/mbart/tokenization_mbart_fast.py | 2 +- src/transformers/models/t5/tokenization_t5.py | 2 +- .../models/t5/tokenization_t5_fast.py | 2 +- tests/test_tokenization_mbart.py | 11 ++--------- 6 files changed, 15 insertions(+), 22 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index da3b767271..0fa61008ce 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -77,7 +77,7 @@ jobs: - v0.4-torch_and_tf-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[sklearn,tf-cpu,torch,testing] + - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece] - save_cache: key: v0.4-{{ checksum "setup.py" }} paths: @@ -103,7 +103,7 @@ jobs: - v0.4-torch-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[sklearn,torch,testing] + - run: pip install .[sklearn,torch,testing,sentencepiece] - save_cache: key: v0.4-torch-{{ checksum "setup.py" }} paths: @@ -129,7 +129,7 @@ jobs: - v0.4-tf-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[sklearn,tf-cpu,testing] + - run: pip install .[sklearn,tf-cpu,testing,sentencepiece] - save_cache: key: v0.4-tf-{{ checksum "setup.py" }} paths: @@ -155,7 +155,7 @@ jobs: - v0.4-flax-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: sudo pip install .[flax,sklearn,torch,testing] + - run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece] - save_cache: key: v0.4-flax-{{ 
checksum "setup.py" }} paths: @@ -181,7 +181,7 @@ jobs: - v0.4-torch-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[sklearn,torch,testing] + - run: pip install .[sklearn,torch,testing,sentencepiece] - save_cache: key: v0.4-torch-{{ checksum "setup.py" }} paths: @@ -207,7 +207,7 @@ jobs: - v0.4-tf-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[sklearn,tf-cpu,testing] + - run: pip install .[sklearn,tf-cpu,testing,sentencepiece] - save_cache: key: v0.4-tf-{{ checksum "setup.py" }} paths: @@ -231,7 +231,7 @@ jobs: - v0.4-custom_tokenizers-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[ja,testing] + - run: pip install .[ja,testing,sentencepiece] - run: python -m unidic download - save_cache: key: v0.4-custom_tokenizers-{{ checksum "setup.py" }} @@ -258,7 +258,7 @@ jobs: - v0.4-torch_examples-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[sklearn,torch,testing] + - run: pip install .[sklearn,torch,sentencepiece,testing] - run: pip install -r examples/requirements.txt - save_cache: key: v0.4-torch_examples-{{ checksum "setup.py" }} @@ -324,7 +324,7 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - run: pip install isort - - run: pip install .[tf,torch,flax,quality] + - run: pip install .[all,quality] - save_cache: key: v0.4-code_quality-{{ checksum "setup.py" }} paths: diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index 468d218ed3..d96e69fe30 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -188,7 +188,7 @@ class MBartTokenizer(XLMRobertaTokenizer): **kwargs, ) -> BatchEncoding: if max_length is None: - max_length = self.max_len + 
max_length = self.model_max_length self.set_src_lang_special_tokens(src_lang) model_inputs: BatchEncoding = self( src_texts, diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index 14b6e4919b..879c876afc 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -185,7 +185,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast): **kwargs, ) -> BatchEncoding: if max_length is None: - max_length = self.max_len + max_length = self.model_max_length self.set_src_lang_special_tokens(src_lang) model_inputs: BatchEncoding = self( src_texts, diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 95359e3ac7..c4d57e0ac1 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -309,7 +309,7 @@ class T5Tokenizer(PreTrainedTokenizer): **kwargs, ) -> BatchEncoding: if max_length is None: - max_length = self.max_len + max_length = self.model_max_length model_inputs = self( src_texts, add_special_tokens=True, diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py index 7ae47bd3ad..5b259ea087 100644 --- a/src/transformers/models/t5/tokenization_t5_fast.py +++ b/src/transformers/models/t5/tokenization_t5_fast.py @@ -226,7 +226,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast): **kwargs, ) -> BatchEncoding: if max_length is None: - max_length = self.max_len + max_length = self.model_max_length self.prefix_tokens = [] model_inputs = self( src_texts, diff --git a/tests/test_tokenization_mbart.py b/tests/test_tokenization_mbart.py index f41925e0b9..64c6c92b27 100644 --- a/tests/test_tokenization_mbart.py +++ b/tests/test_tokenization_mbart.py @@ -1,14 +1,7 @@ import tempfile import unittest -from transformers import ( - SPIECE_UNDERLINE, - AutoTokenizer, - 
BatchEncoding, - MBartTokenizer, - MBartTokenizerFast, - is_torch_available, -) +from transformers import SPIECE_UNDERLINE, BatchEncoding, MBartTokenizer, MBartTokenizerFast, is_torch_available from transformers.testing_utils import ( _sentencepiece_available, require_sentencepiece, @@ -138,7 +131,7 @@ class MBartEnroIntegrationTest(unittest.TestCase): @classmethod def setUpClass(cls): - cls.tokenizer: MBartTokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name) + cls.tokenizer: MBartTokenizer = MBartTokenizer.from_pretrained(cls.checkpoint_name) cls.pad_token_id = 1 return cls