diff --git a/templates/adding_a_new_model/tests/test_tokenization_xxx.py b/templates/adding_a_new_model/tests/test_tokenization_xxx.py
index 2a7b58edb6..bbfe256db7 100644
--- a/templates/adding_a_new_model/tests/test_tokenization_xxx.py
+++ b/templates/adding_a_new_model/tests/test_tokenization_xxx.py
@@ -15,14 +15,15 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import os
+import unittest
from io import open
from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
-class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XxxTokenizer
diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py
index 16bf9ea572..39c8d6ce51 100644
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function
import json
import os
-from .test_tokenization_commo import TemporaryDirectory
+from .test_tokenization_common import TemporaryDirectory
class ConfigTester(object):
diff --git a/tests/test_model_card.py b/tests/test_model_card.py
index 8f4b1d24b7..d6cece5f15 100644
--- a/tests/test_model_card.py
+++ b/tests/test_model_card.py
@@ -20,7 +20,7 @@ import unittest
from transformers.modelcard import ModelCard
-from .test_tokenization_commo import TemporaryDirectory
+from .test_tokenization_common import TemporaryDirectory
class ModelCardTester(unittest.TestCase):
diff --git a/tests/test_optimization.py b/tests/test_optimization.py
index fa628be8ef..d6871420f4 100644
--- a/tests/test_optimization.py
+++ b/tests/test_optimization.py
@@ -19,7 +19,7 @@ import unittest
from transformers import is_torch_available
-from .test_tokenization_commo import TemporaryDirectory
+from .test_tokenization_common import TemporaryDirectory
from .utils import require_torch
diff --git a/tests/test_tokenization_albert.py b/tests/test_tokenization_albert.py
index 3a3c47537f..834cfef58c 100644
--- a/tests/test_tokenization_albert.py
+++ b/tests/test_tokenization_albert.py
@@ -15,16 +15,17 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import os
+import unittest
from transformers.tokenization_albert import AlbertTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model")
-class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = AlbertTokenizer
diff --git a/tests/test_tokenization_bert.py b/tests/test_tokenization_bert.py
index 4b1cb5b9c9..6d64beb9b0 100644
--- a/tests/test_tokenization_bert.py
+++ b/tests/test_tokenization_bert.py
@@ -15,6 +15,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import os
+import unittest
from io import open
from transformers.tokenization_bert import (
@@ -27,11 +28,11 @@ from transformers.tokenization_bert import (
_is_whitespace,
)
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import slow
-class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertTokenizer
diff --git a/tests/test_tokenization_bert_japanese.py b/tests/test_tokenization_bert_japanese.py
index 519cc199f8..f8947e2a66 100644
--- a/tests/test_tokenization_bert_japanese.py
+++ b/tests/test_tokenization_bert_japanese.py
@@ -15,6 +15,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import os
+import unittest
from io import open
from transformers.tokenization_bert import WordpieceTokenizer
@@ -25,12 +26,12 @@ from transformers.tokenization_bert_japanese import (
MecabTokenizer,
)
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import custom_tokenizers, slow
@custom_tokenizers
-class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertJapaneseTokenizer
@@ -130,7 +131,7 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
assert encoded_pair == [2] + text + [3] + text_2 + [3]
-class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertJapaneseTokenizer
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 79b4bf7810..60ae6b523d 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -18,7 +18,6 @@ import os
import shutil
import sys
import tempfile
-import unittest
from io import open
@@ -43,489 +42,479 @@ else:
unicode = str
-class CommonTestCases:
- class CommonTokenizerTester(unittest.TestCase):
+class TokenizerTesterMixin:
- tokenizer_class = None
+ tokenizer_class = None
- def setUp(self):
- self.tmpdirname = tempfile.mkdtemp()
+ def setUp(self):
+ self.tmpdirname = tempfile.mkdtemp()
- def tearDown(self):
- shutil.rmtree(self.tmpdirname)
+ def tearDown(self):
+ shutil.rmtree(self.tmpdirname)
- def get_tokenizer(self, **kwargs):
- raise NotImplementedError
+ def get_tokenizer(self, **kwargs):
+ raise NotImplementedError
- def get_input_output_texts(self):
- raise NotImplementedError
+ def get_input_output_texts(self):
+ raise NotImplementedError
- def test_tokenizers_common_properties(self):
- tokenizer = self.get_tokenizer()
- attributes_list = [
- "bos_token",
- "eos_token",
- "unk_token",
- "sep_token",
- "pad_token",
- "cls_token",
- "mask_token",
- ]
- for attr in attributes_list:
- self.assertTrue(hasattr(tokenizer, attr))
- self.assertTrue(hasattr(tokenizer, attr + "_id"))
+ def test_tokenizers_common_properties(self):
+ tokenizer = self.get_tokenizer()
+ attributes_list = [
+ "bos_token",
+ "eos_token",
+ "unk_token",
+ "sep_token",
+ "pad_token",
+ "cls_token",
+ "mask_token",
+ ]
+ for attr in attributes_list:
+ self.assertTrue(hasattr(tokenizer, attr))
+ self.assertTrue(hasattr(tokenizer, attr + "_id"))
- self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
- self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))
+ self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
+ self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))
- attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"]
- for attr in attributes_list:
- self.assertTrue(hasattr(tokenizer, attr))
+ attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"]
+ for attr in attributes_list:
+ self.assertTrue(hasattr(tokenizer, attr))
- def test_save_and_load_tokenizer(self):
- # safety check on max_len default value so we are sure the test works
- tokenizer = self.get_tokenizer()
- self.assertNotEqual(tokenizer.max_len, 42)
+ def test_save_and_load_tokenizer(self):
+ # safety check on max_len default value so we are sure the test works
+ tokenizer = self.get_tokenizer()
+ self.assertNotEqual(tokenizer.max_len, 42)
- # Now let's start the test
- tokenizer = self.get_tokenizer(max_len=42)
+ # Now let's start the test
+ tokenizer = self.get_tokenizer(max_len=42)
- before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
+ before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
- with TemporaryDirectory() as tmpdirname:
- tokenizer.save_pretrained(tmpdirname)
- tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
+ with TemporaryDirectory() as tmpdirname:
+ tokenizer.save_pretrained(tmpdirname)
+ tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
- after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
- self.assertListEqual(before_tokens, after_tokens)
+ after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
+ self.assertListEqual(before_tokens, after_tokens)
- self.assertEqual(tokenizer.max_len, 42)
- tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
- self.assertEqual(tokenizer.max_len, 43)
+ self.assertEqual(tokenizer.max_len, 42)
+ tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
+ self.assertEqual(tokenizer.max_len, 43)
- def test_pickle_tokenizer(self):
- tokenizer = self.get_tokenizer()
- self.assertIsNotNone(tokenizer)
+ def test_pickle_tokenizer(self):
+ tokenizer = self.get_tokenizer()
+ self.assertIsNotNone(tokenizer)
- text = "Munich and Berlin are nice cities"
- subwords = tokenizer.tokenize(text)
+ text = "Munich and Berlin are nice cities"
+ subwords = tokenizer.tokenize(text)
- with TemporaryDirectory() as tmpdirname:
+ with TemporaryDirectory() as tmpdirname:
- filename = os.path.join(tmpdirname, "tokenizer.bin")
- with open(filename, "wb") as handle:
- pickle.dump(tokenizer, handle)
+ filename = os.path.join(tmpdirname, "tokenizer.bin")
+ with open(filename, "wb") as handle:
+ pickle.dump(tokenizer, handle)
- with open(filename, "rb") as handle:
- tokenizer_new = pickle.load(handle)
+ with open(filename, "rb") as handle:
+ tokenizer_new = pickle.load(handle)
- subwords_loaded = tokenizer_new.tokenize(text)
+ subwords_loaded = tokenizer_new.tokenize(text)
- self.assertListEqual(subwords, subwords_loaded)
+ self.assertListEqual(subwords, subwords_loaded)
- def test_added_tokens_do_lower_case(self):
- tokenizer = self.get_tokenizer(do_lower_case=True)
+ def test_added_tokens_do_lower_case(self):
+ tokenizer = self.get_tokenizer(do_lower_case=True)
- special_token = tokenizer.all_special_tokens[0]
+ special_token = tokenizer.all_special_tokens[0]
- text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
- text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token
+ text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
+ text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token
- toks0 = tokenizer.tokenize(text) # toks before adding new_toks
+ toks0 = tokenizer.tokenize(text) # toks before adding new_toks
- new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
- added = tokenizer.add_tokens(new_toks)
- self.assertEqual(added, 2)
+ new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
+ added = tokenizer.add_tokens(new_toks)
+ self.assertEqual(added, 2)
- toks = tokenizer.tokenize(text)
- toks2 = tokenizer.tokenize(text2)
+ toks = tokenizer.tokenize(text)
+ toks2 = tokenizer.tokenize(text2)
- self.assertEqual(len(toks), len(toks2))
- self.assertNotEqual(len(toks), len(toks0)) # toks0 should be longer
- self.assertListEqual(toks, toks2)
+ self.assertEqual(len(toks), len(toks2))
+ self.assertNotEqual(len(toks), len(toks0)) # toks0 should be longer
+ self.assertListEqual(toks, toks2)
- # Check that none of the special tokens are lowercased
- sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
- tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)
+ # Check that none of the special tokens are lowercased
+ sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
+ tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)
- for special_token in tokenizer.all_special_tokens:
- self.assertTrue(special_token in tokenized_sequence)
+ for special_token in tokenizer.all_special_tokens:
+ self.assertTrue(special_token in tokenized_sequence)
- tokenizer = self.get_tokenizer(do_lower_case=False)
+ tokenizer = self.get_tokenizer(do_lower_case=False)
- added = tokenizer.add_tokens(new_toks)
- self.assertEqual(added, 4)
+ added = tokenizer.add_tokens(new_toks)
+ self.assertEqual(added, 4)
- toks = tokenizer.tokenize(text)
- toks2 = tokenizer.tokenize(text2)
+ toks = tokenizer.tokenize(text)
+ toks2 = tokenizer.tokenize(text2)
- self.assertEqual(len(toks), len(toks2)) # Length should still be the same
- self.assertNotEqual(len(toks), len(toks0))
- self.assertNotEqual(toks[1], toks2[1]) # But at least the first non-special tokens should differ
+ self.assertEqual(len(toks), len(toks2)) # Length should still be the same
+ self.assertNotEqual(len(toks), len(toks0))
+ self.assertNotEqual(toks[1], toks2[1]) # But at least the first non-special tokens should differ
- def test_add_tokens_tokenizer(self):
- tokenizer = self.get_tokenizer()
+ def test_add_tokens_tokenizer(self):
+ tokenizer = self.get_tokenizer()
- vocab_size = tokenizer.vocab_size
- all_size = len(tokenizer)
+ vocab_size = tokenizer.vocab_size
+ all_size = len(tokenizer)
- self.assertNotEqual(vocab_size, 0)
- self.assertEqual(vocab_size, all_size)
+ self.assertNotEqual(vocab_size, 0)
+ self.assertEqual(vocab_size, all_size)
- new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
- added_toks = tokenizer.add_tokens(new_toks)
- vocab_size_2 = tokenizer.vocab_size
- all_size_2 = len(tokenizer)
+ new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
+ added_toks = tokenizer.add_tokens(new_toks)
+ vocab_size_2 = tokenizer.vocab_size
+ all_size_2 = len(tokenizer)
- self.assertNotEqual(vocab_size_2, 0)
- self.assertEqual(vocab_size, vocab_size_2)
- self.assertEqual(added_toks, len(new_toks))
- self.assertEqual(all_size_2, all_size + len(new_toks))
+ self.assertNotEqual(vocab_size_2, 0)
+ self.assertEqual(vocab_size, vocab_size_2)
+ self.assertEqual(added_toks, len(new_toks))
+ self.assertEqual(all_size_2, all_size + len(new_toks))
- tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
- out_string = tokenizer.decode(tokens)
+ tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
+ out_string = tokenizer.decode(tokens)
- self.assertGreaterEqual(len(tokens), 4)
- self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
- self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+ self.assertGreaterEqual(len(tokens), 4)
+ self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+ self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
- new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
- added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
- vocab_size_3 = tokenizer.vocab_size
- all_size_3 = len(tokenizer)
+ new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
+ added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
+ vocab_size_3 = tokenizer.vocab_size
+ all_size_3 = len(tokenizer)
- self.assertNotEqual(vocab_size_3, 0)
- self.assertEqual(vocab_size, vocab_size_3)
- self.assertEqual(added_toks_2, len(new_toks_2))
- self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
+ self.assertNotEqual(vocab_size_3, 0)
+ self.assertEqual(vocab_size, vocab_size_3)
+ self.assertEqual(added_toks_2, len(new_toks_2))
+ self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
- tokens = tokenizer.encode(
- ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
- )
- out_string = tokenizer.decode(tokens)
+ tokens = tokenizer.encode(
+ ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
+ )
+ out_string = tokenizer.decode(tokens)
- self.assertGreaterEqual(len(tokens), 6)
- self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
- self.assertGreater(tokens[0], tokens[1])
- self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
- self.assertGreater(tokens[-2], tokens[-3])
- self.assertEqual(tokens[0], tokenizer.eos_token_id)
- self.assertEqual(tokens[-2], tokenizer.pad_token_id)
+ self.assertGreaterEqual(len(tokens), 6)
+ self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+ self.assertGreater(tokens[0], tokens[1])
+ self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+ self.assertGreater(tokens[-2], tokens[-3])
+ self.assertEqual(tokens[0], tokenizer.eos_token_id)
+ self.assertEqual(tokens[-2], tokenizer.pad_token_id)
- def test_add_special_tokens(self):
- tokenizer = self.get_tokenizer()
- input_text, output_text = self.get_input_output_texts()
+ def test_add_special_tokens(self):
+ tokenizer = self.get_tokenizer()
+ input_text, output_text = self.get_input_output_texts()
- special_token = "[SPECIAL TOKEN]"
+ special_token = "[SPECIAL TOKEN]"
- tokenizer.add_special_tokens({"cls_token": special_token})
- encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
- assert len(encoded_special_token) == 1
+ tokenizer.add_special_tokens({"cls_token": special_token})
+ encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
+ assert len(encoded_special_token) == 1
- text = " ".join([input_text, special_token, output_text])
- encoded = tokenizer.encode(text, add_special_tokens=False)
+ text = " ".join([input_text, special_token, output_text])
+ encoded = tokenizer.encode(text, add_special_tokens=False)
- input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
- output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
- special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
- assert encoded == input_encoded + special_token_id + output_encoded
+ input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
+ output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
+ special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
+ assert encoded == input_encoded + special_token_id + output_encoded
- decoded = tokenizer.decode(encoded, skip_special_tokens=True)
- assert special_token not in decoded
+ decoded = tokenizer.decode(encoded, skip_special_tokens=True)
+ assert special_token not in decoded
- def test_required_methods_tokenizer(self):
- tokenizer = self.get_tokenizer()
- input_text, output_text = self.get_input_output_texts()
+ def test_required_methods_tokenizer(self):
+ tokenizer = self.get_tokenizer()
+ input_text, output_text = self.get_input_output_texts()
- tokens = tokenizer.tokenize(input_text)
- ids = tokenizer.convert_tokens_to_ids(tokens)
- ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
- self.assertListEqual(ids, ids_2)
+ tokens = tokenizer.tokenize(input_text)
+ ids = tokenizer.convert_tokens_to_ids(tokens)
+ ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
+ self.assertListEqual(ids, ids_2)
- tokens_2 = tokenizer.convert_ids_to_tokens(ids)
- text_2 = tokenizer.decode(ids)
+ tokens_2 = tokenizer.convert_ids_to_tokens(ids)
+ text_2 = tokenizer.decode(ids)
- self.assertEqual(text_2, output_text)
+ self.assertEqual(text_2, output_text)
- self.assertNotEqual(len(tokens_2), 0)
- self.assertIsInstance(text_2, (str, unicode))
+ self.assertNotEqual(len(tokens_2), 0)
+ self.assertIsInstance(text_2, (str, unicode))
- def test_encode_decode_with_spaces(self):
- tokenizer = self.get_tokenizer()
+ def test_encode_decode_with_spaces(self):
+ tokenizer = self.get_tokenizer()
- new_toks = ["[ABC]", "[DEF]", "GHI IHG"]
- tokenizer.add_tokens(new_toks)
- input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
- encoded = tokenizer.encode(input, add_special_tokens=False)
- decoded = tokenizer.decode(encoded)
- self.assertEqual(decoded, input)
+ new_toks = ["[ABC]", "[DEF]", "GHI IHG"]
+ tokenizer.add_tokens(new_toks)
+ input_text = "[ABC] [DEF] [ABC] GHI IHG [DEF]"  # renamed from `input` to avoid shadowing the builtin
+ encoded = tokenizer.encode(input_text, add_special_tokens=False)
+ decoded = tokenizer.decode(encoded)
+ self.assertEqual(decoded, input_text)  # encode -> decode must reproduce the text, spaces included
- def test_pretrained_model_lists(self):
- weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
- weights_lists_2 = []
- for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
- weights_lists_2.append(list(map_list.keys()))
+ def test_pretrained_model_lists(self):
+ weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
+ weights_lists_2 = []
+ for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
+ weights_lists_2.append(list(map_list.keys()))
- for weights_list_2 in weights_lists_2:
- self.assertListEqual(weights_list, weights_list_2)
+ for weights_list_2 in weights_lists_2:
+ self.assertListEqual(weights_list, weights_list_2)
- def test_mask_output(self):
- if sys.version_info <= (3, 0):
- return
-
- tokenizer = self.get_tokenizer()
-
- if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
- seq_0 = "Test this method."
- seq_1 = "With these inputs."
- information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
- sequences, mask = information["input_ids"], information["token_type_ids"]
- self.assertEqual(len(sequences), len(mask))
-
- def test_number_of_added_tokens(self):
- tokenizer = self.get_tokenizer()
+ def test_mask_output(self):
+ if sys.version_info <= (3, 0):
+ return
+
+ tokenizer = self.get_tokenizer()
+ if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
seq_0 = "Test this method."
seq_1 = "With these inputs."
+ information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
+ sequences, mask = information["input_ids"], information["token_type_ids"]
+ self.assertEqual(len(sequences), len(mask))
- sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
- attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
+ def test_number_of_added_tokens(self):
+ tokenizer = self.get_tokenizer()
- # Method is implemented (e.g. not GPT-2)
- if len(attached_sequences) != 2:
- self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))
+ seq_0 = "Test this method."
+ seq_1 = "With these inputs."
- def test_maximum_encoding_length_single_input(self):
- tokenizer = self.get_tokenizer()
+ sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
+ attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
- seq_0 = "This is a sentence to be encoded."
- stride = 2
+ # Method is implemented (e.g. not GPT-2)
+ if len(attached_sequences) != 2:
+ self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))
- sequence = tokenizer.encode(seq_0, add_special_tokens=False)
- num_added_tokens = tokenizer.num_added_tokens()
- total_length = len(sequence) + num_added_tokens
- information = tokenizer.encode_plus(
- seq_0,
- max_length=total_length - 2,
- add_special_tokens=True,
- stride=stride,
- return_overflowing_tokens=True,
- )
+ def test_maximum_encoding_length_single_input(self):
+ tokenizer = self.get_tokenizer()
- truncated_sequence = information["input_ids"]
- overflowing_tokens = information["overflowing_tokens"]
+ seq_0 = "This is a sentence to be encoded."
+ stride = 2
- self.assertEqual(len(overflowing_tokens), 2 + stride)
- self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
- self.assertEqual(len(truncated_sequence), total_length - 2)
- self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))
+ sequence = tokenizer.encode(seq_0, add_special_tokens=False)
+ num_added_tokens = tokenizer.num_added_tokens()
+ total_length = len(sequence) + num_added_tokens
+ information = tokenizer.encode_plus(
+ seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride, return_overflowing_tokens=True,
+ )
- def test_maximum_encoding_length_pair_input(self):
- tokenizer = self.get_tokenizer()
+ truncated_sequence = information["input_ids"]
+ overflowing_tokens = information["overflowing_tokens"]
- seq_0 = "This is a sentence to be encoded."
- seq_1 = "This is another sentence to be encoded."
- stride = 2
+ self.assertEqual(len(overflowing_tokens), 2 + stride)
+ self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
+ self.assertEqual(len(truncated_sequence), total_length - 2)
+ self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))
- sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
- sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
+ def test_maximum_encoding_length_pair_input(self):
+ tokenizer = self.get_tokenizer()
- sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
- truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
- tokenizer.encode(seq_0, add_special_tokens=False),
- tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
- )
+ seq_0 = "This is a sentence to be encoded."
+ seq_1 = "This is another sentence to be encoded."
+ stride = 2
- information = tokenizer.encode_plus(
- seq_0,
- seq_1,
- max_length=len(sequence) - 2,
- add_special_tokens=True,
- stride=stride,
- truncation_strategy="only_second",
- return_overflowing_tokens=True,
- )
- information_first_truncated = tokenizer.encode_plus(
- seq_0,
- seq_1,
- max_length=len(sequence) - 2,
- add_special_tokens=True,
- stride=stride,
- truncation_strategy="only_first",
- return_overflowing_tokens=True,
- )
+ sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
+ sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
- truncated_sequence = information["input_ids"]
- overflowing_tokens = information["overflowing_tokens"]
- overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
+ sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
+ truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
+ tokenizer.encode(seq_0, add_special_tokens=False), tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
+ )
- self.assertEqual(len(overflowing_tokens), 2 + stride)
- self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :])
- self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :])
- self.assertEqual(len(truncated_sequence), len(sequence) - 2)
- self.assertEqual(truncated_sequence, truncated_second_sequence)
+ information = tokenizer.encode_plus(
+ seq_0,
+ seq_1,
+ max_length=len(sequence) - 2,
+ add_special_tokens=True,
+ stride=stride,
+ truncation_strategy="only_second",
+ return_overflowing_tokens=True,
+ )
+ information_first_truncated = tokenizer.encode_plus(
+ seq_0,
+ seq_1,
+ max_length=len(sequence) - 2,
+ add_special_tokens=True,
+ stride=stride,
+ truncation_strategy="only_first",
+ return_overflowing_tokens=True,
+ )
- def test_encode_input_type(self):
- tokenizer = self.get_tokenizer()
+ truncated_sequence = information["input_ids"]
+ overflowing_tokens = information["overflowing_tokens"]
+ overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
- sequence = "Let's encode this sequence"
+ self.assertEqual(len(overflowing_tokens), 2 + stride)
+ self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :])
+ self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :])
+ self.assertEqual(len(truncated_sequence), len(sequence) - 2)
+ self.assertEqual(truncated_sequence, truncated_second_sequence)
- tokens = tokenizer.tokenize(sequence)
- input_ids = tokenizer.convert_tokens_to_ids(tokens)
- formatted_input = tokenizer.encode(sequence, add_special_tokens=True)
+ def test_encode_input_type(self):
+ tokenizer = self.get_tokenizer()
- self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
- self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
+ sequence = "Let's encode this sequence"
- def test_special_tokens_mask(self):
- tokenizer = self.get_tokenizer()
+ tokens = tokenizer.tokenize(sequence)
+ input_ids = tokenizer.convert_tokens_to_ids(tokens)
+ formatted_input = tokenizer.encode(sequence, add_special_tokens=True)
- sequence_0 = "Encode this."
- sequence_1 = "This one too please."
+ self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
+ self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
- # Testing single inputs
- encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
- encoded_sequence_dict = tokenizer.encode_plus(
- sequence_0, add_special_tokens=True, return_special_tokens_mask=True
- )
- encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
- special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
- self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+ def test_special_tokens_mask(self):
+ tokenizer = self.get_tokenizer()
- filtered_sequence = [
- (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
- ]
- filtered_sequence = [x for x in filtered_sequence if x is not None]
- self.assertEqual(encoded_sequence, filtered_sequence)
+ sequence_0 = "Encode this."
+ sequence_1 = "This one too please."
- # Testing inputs pairs
- encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
- sequence_1, add_special_tokens=False
- )
- encoded_sequence_dict = tokenizer.encode_plus(
- sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True
- )
- encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
- special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
- self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+ # Testing single inputs
+ encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
+ encoded_sequence_dict = tokenizer.encode_plus(
+ sequence_0, add_special_tokens=True, return_special_tokens_mask=True
+ )
+ encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+ special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+ self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
- filtered_sequence = [
- (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
- ]
- filtered_sequence = [x for x in filtered_sequence if x is not None]
- self.assertEqual(encoded_sequence, filtered_sequence)
+ filtered_sequence = [
+ (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
+ ]
+ filtered_sequence = [x for x in filtered_sequence if x is not None]
+ self.assertEqual(encoded_sequence, filtered_sequence)
- # Testing with already existing special tokens
- if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
- tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})
- encoded_sequence_dict = tokenizer.encode_plus(
- sequence_0, add_special_tokens=True, return_special_tokens_mask=True
- )
- encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
- special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
- special_tokens_mask = tokenizer.get_special_tokens_mask(
- encoded_sequence_w_special, already_has_special_tokens=True
- )
- self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
- self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
+ # Testing inputs pairs
+ encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
+ sequence_1, add_special_tokens=False
+ )
+ encoded_sequence_dict = tokenizer.encode_plus(
+ sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True
+ )
+ encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+ special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
+ self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
- def test_padding_to_max_length(self):
- tokenizer = self.get_tokenizer()
+ filtered_sequence = [
+ (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
+ ]
+ filtered_sequence = [x for x in filtered_sequence if x is not None]
+ self.assertEqual(encoded_sequence, filtered_sequence)
- sequence = "Sequence"
- padding_size = 10
- padding_idx = tokenizer.pad_token_id
+ # Testing with already existing special tokens (cls/sep are added first if CLS maps to UNK)
+ if tokenizer.cls_token_id == tokenizer.unk_token_id:
+ tokenizer.add_special_tokens({"cls_token": "", "sep_token": ""})
+ encoded_sequence_dict = tokenizer.encode_plus(
+ sequence_0, add_special_tokens=True, return_special_tokens_mask=True
+ )
+ encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
+ special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
+ special_tokens_mask = tokenizer.get_special_tokens_mask(
+ encoded_sequence_w_special, already_has_special_tokens=True
+ )
+ self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
+ self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
- # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
- tokenizer.padding_side = "right"
- encoded_sequence = tokenizer.encode(sequence)
- sequence_length = len(encoded_sequence)
- padded_sequence = tokenizer.encode(
- sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
- )
- padded_sequence_length = len(padded_sequence)
- assert sequence_length + padding_size == padded_sequence_length
- assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
+ def test_padding_to_max_length(self):
+ tokenizer = self.get_tokenizer()
- # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
- tokenizer.padding_side = "left"
- encoded_sequence = tokenizer.encode(sequence)
- sequence_length = len(encoded_sequence)
- padded_sequence = tokenizer.encode(
- sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
- )
- padded_sequence_length = len(padded_sequence)
- assert sequence_length + padding_size == padded_sequence_length
- assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
+ sequence = "Sequence"
+ padding_size = 10
+ padding_idx = tokenizer.pad_token_id
- # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
- encoded_sequence = tokenizer.encode(sequence)
- sequence_length = len(encoded_sequence)
+ # RIGHT PADDING - Check that with max_length and pad_to_max_length=True, pad ids are appended after the encoded ids
+ tokenizer.padding_side = "right"
+ encoded_sequence = tokenizer.encode(sequence)
+ sequence_length = len(encoded_sequence)
+ padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
+ padded_sequence_length = len(padded_sequence)
+ assert sequence_length + padding_size == padded_sequence_length
+ assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
- tokenizer.padding_side = "right"
- padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
- padded_sequence_right_length = len(padded_sequence_right)
+ # LEFT PADDING - Check that with max_length and pad_to_max_length=True, pad ids are prepended before the encoded ids
+ tokenizer.padding_side = "left"
+ encoded_sequence = tokenizer.encode(sequence)
+ sequence_length = len(encoded_sequence)
+ padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
+ padded_sequence_length = len(padded_sequence)
+ assert sequence_length + padding_size == padded_sequence_length
+ assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
- tokenizer.padding_side = "left"
- padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
- padded_sequence_left_length = len(padded_sequence_left)
+ # RIGHT & LEFT PADDING - Check that without max_length, pad_to_max_length=True leaves the encoding unchanged
+ encoded_sequence = tokenizer.encode(sequence)
+ sequence_length = len(encoded_sequence)
- assert sequence_length == padded_sequence_right_length
- assert encoded_sequence == padded_sequence_right
- assert sequence_length == padded_sequence_left_length
- assert encoded_sequence == padded_sequence_left
+ tokenizer.padding_side = "right"
+ padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
+ padded_sequence_right_length = len(padded_sequence_right)
- def test_encode_plus_with_padding(self):
- tokenizer = self.get_tokenizer()
+ tokenizer.padding_side = "left"
+ padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
+ padded_sequence_left_length = len(padded_sequence_left)
- sequence = "Sequence"
- padding_size = 10
- padding_idx = tokenizer.pad_token_id
- token_type_padding_idx = tokenizer.pad_token_type_id
+ assert sequence_length == padded_sequence_right_length
+ assert encoded_sequence == padded_sequence_right
+ assert sequence_length == padded_sequence_left_length
+ assert encoded_sequence == padded_sequence_left
- encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
- input_ids = encoded_sequence["input_ids"]
- token_type_ids = encoded_sequence["token_type_ids"]
- attention_mask = encoded_sequence["attention_mask"]
- special_tokens_mask = encoded_sequence["special_tokens_mask"]
- sequence_length = len(input_ids)
+ def test_encode_plus_with_padding(self):
+ tokenizer = self.get_tokenizer()
- # Test right padding
- tokenizer.padding_side = "right"
- padded_sequence = tokenizer.encode_plus(
- sequence,
- max_length=sequence_length + padding_size,
- pad_to_max_length=True,
- return_special_tokens_mask=True,
- )
- padded_input_ids = padded_sequence["input_ids"]
- padded_token_type_ids = padded_sequence["token_type_ids"]
- padded_attention_mask = padded_sequence["attention_mask"]
- padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
- padded_sequence_length = len(padded_input_ids)
+ sequence = "Sequence"
+ padding_size = 10
+ padding_idx = tokenizer.pad_token_id
+ token_type_padding_idx = tokenizer.pad_token_type_id
- assert sequence_length + padding_size == padded_sequence_length
- assert input_ids + [padding_idx] * padding_size == padded_input_ids
- assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
- assert attention_mask + [0] * padding_size == padded_attention_mask
- assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
+ encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
+ input_ids = encoded_sequence["input_ids"]
+ token_type_ids = encoded_sequence["token_type_ids"]
+ attention_mask = encoded_sequence["attention_mask"]
+ special_tokens_mask = encoded_sequence["special_tokens_mask"]
+ sequence_length = len(input_ids)
- # Test left padding
- tokenizer.padding_side = "left"
- padded_sequence = tokenizer.encode_plus(
- sequence,
- max_length=sequence_length + padding_size,
- pad_to_max_length=True,
- return_special_tokens_mask=True,
- )
- padded_input_ids = padded_sequence["input_ids"]
- padded_token_type_ids = padded_sequence["token_type_ids"]
- padded_attention_mask = padded_sequence["attention_mask"]
- padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
- padded_sequence_length = len(padded_input_ids)
+ # Test right padding: all four outputs are padded at the end (attention_mask with 0, special_tokens_mask with 1)
+ tokenizer.padding_side = "right"
+ padded_sequence = tokenizer.encode_plus(
+ sequence,
+ max_length=sequence_length + padding_size,
+ pad_to_max_length=True,
+ return_special_tokens_mask=True,
+ )
+ padded_input_ids = padded_sequence["input_ids"]
+ padded_token_type_ids = padded_sequence["token_type_ids"]
+ padded_attention_mask = padded_sequence["attention_mask"]
+ padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
+ padded_sequence_length = len(padded_input_ids)
- assert sequence_length + padding_size == padded_sequence_length
- assert [padding_idx] * padding_size + input_ids == padded_input_ids
- assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
- assert [0] * padding_size + attention_mask == padded_attention_mask
- assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
+ assert sequence_length + padding_size == padded_sequence_length
+ assert input_ids + [padding_idx] * padding_size == padded_input_ids
+ assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
+ assert attention_mask + [0] * padding_size == padded_attention_mask
+ assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
+
+ # Test left padding: all four outputs are padded at the start (attention_mask with 0, special_tokens_mask with 1)
+ tokenizer.padding_side = "left"
+ padded_sequence = tokenizer.encode_plus(
+ sequence,
+ max_length=sequence_length + padding_size,
+ pad_to_max_length=True,
+ return_special_tokens_mask=True,
+ )
+ padded_input_ids = padded_sequence["input_ids"]
+ padded_token_type_ids = padded_sequence["token_type_ids"]
+ padded_attention_mask = padded_sequence["attention_mask"]
+ padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
+ padded_sequence_length = len(padded_input_ids)
+
+ assert sequence_length + padding_size == padded_sequence_length
+ assert [padding_idx] * padding_size + input_ids == padded_input_ids
+ assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
+ assert [0] * padding_size + attention_mask == padded_attention_mask
+ assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
diff --git a/tests/test_tokenization_ctrl.py b/tests/test_tokenization_ctrl.py
index 77ff6a86ea..89d9f78024 100644
--- a/tests/test_tokenization_ctrl.py
+++ b/tests/test_tokenization_ctrl.py
@@ -15,14 +15,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import json
import os
+import unittest
from io import open
from transformers.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
-class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = CTRLTokenizer
diff --git a/tests/test_tokenization_gpt2.py b/tests/test_tokenization_gpt2.py
index fbc45738f0..3dea20d8c5 100644
--- a/tests/test_tokenization_gpt2.py
+++ b/tests/test_tokenization_gpt2.py
@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import json
import os
+import unittest
from io import open
from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
-class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
+class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = GPT2Tokenizer
diff --git a/tests/test_tokenization_openai.py b/tests/test_tokenization_openai.py
index a9e8cc38ea..fed01c9efe 100644
--- a/tests/test_tokenization_openai.py
+++ b/tests/test_tokenization_openai.py
@@ -16,13 +16,14 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import json
import os
+import unittest
from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
-class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = OpenAIGPTTokenizer
diff --git a/tests/test_tokenization_roberta.py b/tests/test_tokenization_roberta.py
index bacfd51555..da60063356 100644
--- a/tests/test_tokenization_roberta.py
+++ b/tests/test_tokenization_roberta.py
@@ -16,15 +16,16 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import json
import os
+import unittest
from io import open
from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import slow
-class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = RobertaTokenizer
def setUp(self):
diff --git a/tests/test_tokenization_t5.py b/tests/test_tokenization_t5.py
index 68ceebb83c..62a1d148a7 100644
--- a/tests/test_tokenization_t5.py
+++ b/tests/test_tokenization_t5.py
@@ -15,17 +15,18 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import os
+import unittest
from transformers.tokenization_t5 import T5Tokenizer
from transformers.tokenization_xlnet import SPIECE_UNDERLINE
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
-class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
+class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = T5Tokenizer
diff --git a/tests/test_tokenization_transfo_xl.py b/tests/test_tokenization_transfo_xl.py
index 1d275f591f..cd6c7c58f9 100644
--- a/tests/test_tokenization_transfo_xl.py
+++ b/tests/test_tokenization_transfo_xl.py
@@ -15,11 +15,12 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import os
+import unittest
from io import open
from transformers import is_torch_available
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import require_torch
@@ -28,7 +29,7 @@ if is_torch_available():
@require_torch
-class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
diff --git a/tests/test_tokenization_xlm.py b/tests/test_tokenization_xlm.py
index 12bff7f618..9f9dd8fbc4 100644
--- a/tests/test_tokenization_xlm.py
+++ b/tests/test_tokenization_xlm.py
@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import json
import os
+import unittest
from transformers.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import slow
-class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMTokenizer
diff --git a/tests/test_tokenization_xlnet.py b/tests/test_tokenization_xlnet.py
index ed6430959e..dcc270e683 100644
--- a/tests/test_tokenization_xlnet.py
+++ b/tests/test_tokenization_xlnet.py
@@ -15,17 +15,18 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import os
+import unittest
from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import slow
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
-class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLNetTokenizer