Testing that batch_encode_plus is the same as encode_plus (#2973)

* Testing that encode_plus and batch_encode_plus behave the same way
  (Spoiler alert: they don't)
* Testing rest of arguments in batch_encode_plus
* Test tensor return in batch_encode_plus
* Addressing Sam's comments
* flake8
* Simplified with `num_added_tokens`
2020-02-24 12:09:46 -05:00
parent 17c45c39ed
commit 21d8b6a33e
3 changed files with 222 additions and 39 deletions
--- a/src/transformers/tokenization_t5.py
+++ b/src/transformers/tokenization_t5.py
@@ -98,6 +98,12 @@ class T5Tokenizer(PreTrainedTokenizer):
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
+        self.max_len_single_sentence = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = (
+            self.max_len
+        )  # no default special tokens - you can update this value if you add special tokens

        try:
            import sentencepiece as spm