From 00204f2b4cf52eb9541841c79949bd8f29728b47 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sun, 22 Dec 2019 15:34:15 +0100 Subject: [PATCH] Replace CommonTestCases for tokenizers with a mixin. This is the same change as for (TF)CommonTestCases for modeling. --- .../tests/test_tokenization_xxx.py | 5 +- tests/test_configuration_common.py | 2 +- tests/test_model_card.py | 2 +- tests/test_optimization.py | 2 +- tests/test_tokenization_albert.py | 5 +- tests/test_tokenization_bert.py | 5 +- tests/test_tokenization_bert_japanese.py | 7 +- tests/test_tokenization_common.py | 755 +++++++++--------- tests/test_tokenization_ctrl.py | 5 +- tests/test_tokenization_gpt2.py | 5 +- tests/test_tokenization_openai.py | 5 +- tests/test_tokenization_roberta.py | 5 +- tests/test_tokenization_t5.py | 5 +- tests/test_tokenization_transfo_xl.py | 5 +- tests/test_tokenization_xlm.py | 5 +- tests/test_tokenization_xlnet.py | 5 +- 16 files changed, 412 insertions(+), 411 deletions(-) diff --git a/templates/adding_a_new_model/tests/test_tokenization_xxx.py b/templates/adding_a_new_model/tests/test_tokenization_xxx.py index 2a7b58edb6..bbfe256db7 100644 --- a/templates/adding_a_new_model/tests/test_tokenization_xxx.py +++ b/templates/adding_a_new_model/tests/test_tokenization_xxx.py @@ -15,14 +15,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os +import unittest from io import open from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin -class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester): +class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XxxTokenizer diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 16bf9ea572..39c8d6ce51 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function import json import os -from .test_tokenization_commo import TemporaryDirectory +from .test_tokenization_common import TemporaryDirectory class ConfigTester(object): diff --git a/tests/test_model_card.py b/tests/test_model_card.py index 8f4b1d24b7..d6cece5f15 100644 --- a/tests/test_model_card.py +++ b/tests/test_model_card.py @@ -20,7 +20,7 @@ import unittest from transformers.modelcard import ModelCard -from .test_tokenization_commo import TemporaryDirectory +from .test_tokenization_common import TemporaryDirectory class ModelCardTester(unittest.TestCase): diff --git a/tests/test_optimization.py b/tests/test_optimization.py index fa628be8ef..d6871420f4 100644 --- a/tests/test_optimization.py +++ b/tests/test_optimization.py @@ -19,7 +19,7 @@ import unittest from transformers import is_torch_available -from .test_tokenization_commo import TemporaryDirectory +from .test_tokenization_common import TemporaryDirectory from .utils import require_torch diff --git a/tests/test_tokenization_albert.py b/tests/test_tokenization_albert.py index 3a3c47537f..834cfef58c 100644 --- a/tests/test_tokenization_albert.py +++ b/tests/test_tokenization_albert.py @@ -15,16 +15,17 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os +import unittest from transformers.tokenization_albert import AlbertTokenizer -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model") -class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester): +class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = AlbertTokenizer diff --git a/tests/test_tokenization_bert.py b/tests/test_tokenization_bert.py index 4b1cb5b9c9..6d64beb9b0 100644 --- a/tests/test_tokenization_bert.py +++ b/tests/test_tokenization_bert.py @@ -15,6 +15,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os +import unittest from io import open from transformers.tokenization_bert import ( @@ -27,11 +28,11 @@ from transformers.tokenization_bert import ( _is_whitespace, ) -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin from .utils import slow -class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): +class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BertTokenizer diff --git a/tests/test_tokenization_bert_japanese.py b/tests/test_tokenization_bert_japanese.py index 519cc199f8..f8947e2a66 100644 --- a/tests/test_tokenization_bert_japanese.py +++ b/tests/test_tokenization_bert_japanese.py @@ -15,6 +15,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os +import unittest from io import open from transformers.tokenization_bert import WordpieceTokenizer @@ -25,12 +26,12 @@ from transformers.tokenization_bert_japanese import ( MecabTokenizer, ) -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin from .utils import custom_tokenizers, slow @custom_tokenizers -class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester): +class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BertJapaneseTokenizer @@ -130,7 +131,7 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester): assert encoded_pair == [2] + text + [3] + text_2 + [3] -class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester): +class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BertJapaneseTokenizer diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 79b4bf7810..60ae6b523d 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -18,7 +18,6 @@ import os import shutil import sys import tempfile -import unittest from io import open @@ -43,489 +42,479 @@ else: unicode = str -class CommonTestCases: - class CommonTokenizerTester(unittest.TestCase): +class TokenizerTesterMixin: - tokenizer_class = None + tokenizer_class = None - def setUp(self): - self.tmpdirname = tempfile.mkdtemp() + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() - def tearDown(self): - shutil.rmtree(self.tmpdirname) + def tearDown(self): + shutil.rmtree(self.tmpdirname) - def get_tokenizer(self, **kwargs): - raise NotImplementedError + def get_tokenizer(self, **kwargs): + raise NotImplementedError - def get_input_output_texts(self): - raise NotImplementedError + def get_input_output_texts(self): + raise NotImplementedError - def test_tokenizers_common_properties(self): - tokenizer = self.get_tokenizer() - attributes_list = [ - "bos_token", - "eos_token", - "unk_token", - "sep_token", - "pad_token", - "cls_token", - "mask_token", - ] - for attr in attributes_list: - self.assertTrue(hasattr(tokenizer, attr)) - self.assertTrue(hasattr(tokenizer, attr + "_id")) + def test_tokenizers_common_properties(self): + tokenizer = self.get_tokenizer() + attributes_list = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + ] + for attr in attributes_list: + self.assertTrue(hasattr(tokenizer, attr)) + self.assertTrue(hasattr(tokenizer, attr + "_id")) - self.assertTrue(hasattr(tokenizer, "additional_special_tokens")) - self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids")) + self.assertTrue(hasattr(tokenizer, "additional_special_tokens")) + self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids")) - attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"] - for attr in attributes_list: - self.assertTrue(hasattr(tokenizer, attr)) + attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"] + for attr in attributes_list: + self.assertTrue(hasattr(tokenizer, attr)) - def test_save_and_load_tokenizer(self): - # safety check on max_len default value so we are sure the test works - tokenizer = self.get_tokenizer() - self.assertNotEqual(tokenizer.max_len, 42) + def test_save_and_load_tokenizer(self): + # safety check on max_len default value so we are sure the test works + tokenizer = self.get_tokenizer() + self.assertNotEqual(tokenizer.max_len, 42) - # Now let's start the test - tokenizer = self.get_tokenizer(max_len=42) + # Now let's start the test + tokenizer = self.get_tokenizer(max_len=42) - before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False) + before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False) - with TemporaryDirectory() as tmpdirname: - tokenizer.save_pretrained(tmpdirname) - tokenizer = self.tokenizer_class.from_pretrained(tmpdirname) + with TemporaryDirectory() as tmpdirname: + tokenizer.save_pretrained(tmpdirname) + tokenizer = self.tokenizer_class.from_pretrained(tmpdirname) - after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False) - self.assertListEqual(before_tokens, after_tokens) + after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False) + self.assertListEqual(before_tokens, after_tokens) - self.assertEqual(tokenizer.max_len, 42) - tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43) - self.assertEqual(tokenizer.max_len, 43) + self.assertEqual(tokenizer.max_len, 42) + tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43) + self.assertEqual(tokenizer.max_len, 43) - def test_pickle_tokenizer(self): - tokenizer = self.get_tokenizer() - self.assertIsNotNone(tokenizer) + def test_pickle_tokenizer(self): + tokenizer = self.get_tokenizer() + self.assertIsNotNone(tokenizer) - text = "Munich and Berlin are nice cities" - subwords = tokenizer.tokenize(text) + text = "Munich and Berlin are nice cities" + subwords = tokenizer.tokenize(text) - with TemporaryDirectory() as tmpdirname: + with TemporaryDirectory() as tmpdirname: - filename = os.path.join(tmpdirname, "tokenizer.bin") - with open(filename, "wb") as handle: - pickle.dump(tokenizer, handle) + filename = os.path.join(tmpdirname, "tokenizer.bin") + with open(filename, "wb") as handle: + pickle.dump(tokenizer, handle) - with open(filename, "rb") as handle: - tokenizer_new = pickle.load(handle) + with open(filename, "rb") as handle: + tokenizer_new = pickle.load(handle) - subwords_loaded = tokenizer_new.tokenize(text) + subwords_loaded = tokenizer_new.tokenize(text) - self.assertListEqual(subwords, subwords_loaded) + self.assertListEqual(subwords, subwords_loaded) - def test_added_tokens_do_lower_case(self): - tokenizer = self.get_tokenizer(do_lower_case=True) + def test_added_tokens_do_lower_case(self): + tokenizer = self.get_tokenizer(do_lower_case=True) - special_token = tokenizer.all_special_tokens[0] + special_token = tokenizer.all_special_tokens[0] - text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token - text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token + text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token + text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token - toks0 = tokenizer.tokenize(text) # toks before adding new_toks + toks0 = tokenizer.tokenize(text) # toks before adding new_toks - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"] - added = tokenizer.add_tokens(new_toks) - self.assertEqual(added, 2) + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"] + added = tokenizer.add_tokens(new_toks) + self.assertEqual(added, 2) - toks = tokenizer.tokenize(text) - toks2 = tokenizer.tokenize(text2) + toks = tokenizer.tokenize(text) + toks2 = tokenizer.tokenize(text2) - self.assertEqual(len(toks), len(toks2)) - self.assertNotEqual(len(toks), len(toks0)) # toks0 should be longer - self.assertListEqual(toks, toks2) + self.assertEqual(len(toks), len(toks2)) + self.assertNotEqual(len(toks), len(toks0)) # toks0 should be longer + self.assertListEqual(toks, toks2) - # Check that none of the special tokens are lowercased - sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B" - tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens) + # Check that none of the special tokens are lowercased + sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B" + tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens) - for special_token in tokenizer.all_special_tokens: - self.assertTrue(special_token in tokenized_sequence) + for special_token in tokenizer.all_special_tokens: + self.assertTrue(special_token in tokenized_sequence) - tokenizer = self.get_tokenizer(do_lower_case=False) + tokenizer = self.get_tokenizer(do_lower_case=False) - added = tokenizer.add_tokens(new_toks) - self.assertEqual(added, 4) + added = tokenizer.add_tokens(new_toks) + self.assertEqual(added, 4) - toks = tokenizer.tokenize(text) - toks2 = tokenizer.tokenize(text2) + toks = tokenizer.tokenize(text) + toks2 = tokenizer.tokenize(text2) - self.assertEqual(len(toks), len(toks2)) # Length should still be the same - self.assertNotEqual(len(toks), len(toks0)) - self.assertNotEqual(toks[1], toks2[1]) # But at least the first non-special tokens should differ + self.assertEqual(len(toks), len(toks2)) # Length should still be the same + self.assertNotEqual(len(toks), len(toks0)) + self.assertNotEqual(toks[1], toks2[1]) # But at least the first non-special tokens should differ - def test_add_tokens_tokenizer(self): - tokenizer = self.get_tokenizer() + def test_add_tokens_tokenizer(self): + tokenizer = self.get_tokenizer() - vocab_size = tokenizer.vocab_size - all_size = len(tokenizer) + vocab_size = tokenizer.vocab_size + all_size = len(tokenizer) - self.assertNotEqual(vocab_size, 0) - self.assertEqual(vocab_size, all_size) + self.assertNotEqual(vocab_size, 0) + self.assertEqual(vocab_size, all_size) - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] - added_toks = tokenizer.add_tokens(new_toks) - vocab_size_2 = tokenizer.vocab_size - all_size_2 = len(tokenizer) + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] + added_toks = tokenizer.add_tokens(new_toks) + vocab_size_2 = tokenizer.vocab_size + all_size_2 = len(tokenizer) - self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) - self.assertEqual(added_toks, len(new_toks)) - self.assertEqual(all_size_2, all_size + len(new_toks)) + self.assertNotEqual(vocab_size_2, 0) + self.assertEqual(vocab_size, vocab_size_2) + self.assertEqual(added_toks, len(new_toks)) + self.assertEqual(all_size_2, all_size + len(new_toks)) - tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) - out_string = tokenizer.decode(tokens) + tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) + out_string = tokenizer.decode(tokens) - self.assertGreaterEqual(len(tokens), 4) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + self.assertGreaterEqual(len(tokens), 4) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} - added_toks_2 = tokenizer.add_special_tokens(new_toks_2) - vocab_size_3 = tokenizer.vocab_size - all_size_3 = len(tokenizer) + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} + added_toks_2 = tokenizer.add_special_tokens(new_toks_2) + vocab_size_3 = tokenizer.vocab_size + all_size_3 = len(tokenizer) - self.assertNotEqual(vocab_size_3, 0) - self.assertEqual(vocab_size, vocab_size_3) - self.assertEqual(added_toks_2, len(new_toks_2)) - self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) + self.assertNotEqual(vocab_size_3, 0) + self.assertEqual(vocab_size, vocab_size_3) + self.assertEqual(added_toks_2, len(new_toks_2)) + self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - tokens = tokenizer.encode( - ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False - ) - out_string = tokenizer.decode(tokens) + tokens = tokenizer.encode( + ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False + ) + out_string = tokenizer.decode(tokens) - self.assertGreaterEqual(len(tokens), 6) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[0], tokens[1]) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokens[-3]) - self.assertEqual(tokens[0], tokenizer.eos_token_id) - self.assertEqual(tokens[-2], tokenizer.pad_token_id) + self.assertGreaterEqual(len(tokens), 6) + self.assertGreater(tokens[0], tokenizer.vocab_size - 1) + self.assertGreater(tokens[0], tokens[1]) + self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) + self.assertGreater(tokens[-2], tokens[-3]) + self.assertEqual(tokens[0], tokenizer.eos_token_id) + self.assertEqual(tokens[-2], tokenizer.pad_token_id) - def test_add_special_tokens(self): - tokenizer = self.get_tokenizer() - input_text, output_text = self.get_input_output_texts() + def test_add_special_tokens(self): + tokenizer = self.get_tokenizer() + input_text, output_text = self.get_input_output_texts() - special_token = "[SPECIAL TOKEN]" + special_token = "[SPECIAL TOKEN]" - tokenizer.add_special_tokens({"cls_token": special_token}) - encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False) - assert len(encoded_special_token) == 1 + tokenizer.add_special_tokens({"cls_token": special_token}) + encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False) + assert len(encoded_special_token) == 1 - text = " ".join([input_text, special_token, output_text]) - encoded = tokenizer.encode(text, add_special_tokens=False) + text = " ".join([input_text, special_token, output_text]) + encoded = tokenizer.encode(text, add_special_tokens=False) - input_encoded = tokenizer.encode(input_text, add_special_tokens=False) - output_encoded = tokenizer.encode(output_text, add_special_tokens=False) - special_token_id = tokenizer.encode(special_token, add_special_tokens=False) - assert encoded == input_encoded + special_token_id + output_encoded + input_encoded = tokenizer.encode(input_text, add_special_tokens=False) + output_encoded = tokenizer.encode(output_text, add_special_tokens=False) + special_token_id = tokenizer.encode(special_token, add_special_tokens=False) + assert encoded == input_encoded + special_token_id + output_encoded - decoded = tokenizer.decode(encoded, skip_special_tokens=True) - assert special_token not in decoded + decoded = tokenizer.decode(encoded, skip_special_tokens=True) + assert special_token not in decoded - def test_required_methods_tokenizer(self): - tokenizer = self.get_tokenizer() - input_text, output_text = self.get_input_output_texts() + def test_required_methods_tokenizer(self): + tokenizer = self.get_tokenizer() + input_text, output_text = self.get_input_output_texts() - tokens = tokenizer.tokenize(input_text) - ids = tokenizer.convert_tokens_to_ids(tokens) - ids_2 = tokenizer.encode(input_text, add_special_tokens=False) - self.assertListEqual(ids, ids_2) + tokens = tokenizer.tokenize(input_text) + ids = tokenizer.convert_tokens_to_ids(tokens) + ids_2 = tokenizer.encode(input_text, add_special_tokens=False) + self.assertListEqual(ids, ids_2) - tokens_2 = tokenizer.convert_ids_to_tokens(ids) - text_2 = tokenizer.decode(ids) + tokens_2 = tokenizer.convert_ids_to_tokens(ids) + text_2 = tokenizer.decode(ids) - self.assertEqual(text_2, output_text) + self.assertEqual(text_2, output_text) - self.assertNotEqual(len(tokens_2), 0) - self.assertIsInstance(text_2, (str, unicode)) + self.assertNotEqual(len(tokens_2), 0) + self.assertIsInstance(text_2, (str, unicode)) - def test_encode_decode_with_spaces(self): - tokenizer = self.get_tokenizer() + def test_encode_decode_with_spaces(self): + tokenizer = self.get_tokenizer() - new_toks = ["[ABC]", "[DEF]", "GHI IHG"] - tokenizer.add_tokens(new_toks) - input = "[ABC] [DEF] [ABC] GHI IHG [DEF]" - encoded = tokenizer.encode(input, add_special_tokens=False) - decoded = tokenizer.decode(encoded) - self.assertEqual(decoded, input) + new_toks = ["[ABC]", "[DEF]", "GHI IHG"] + tokenizer.add_tokens(new_toks) + input = "[ABC] [DEF] [ABC] GHI IHG [DEF]" + encoded = tokenizer.encode(input, add_special_tokens=False) + decoded = tokenizer.decode(encoded) + self.assertEqual(decoded, input) - def test_pretrained_model_lists(self): - weights_list = list(self.tokenizer_class.max_model_input_sizes.keys()) - weights_lists_2 = [] - for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items(): - weights_lists_2.append(list(map_list.keys())) + def test_pretrained_model_lists(self): + weights_list = list(self.tokenizer_class.max_model_input_sizes.keys()) + weights_lists_2 = [] + for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items(): + weights_lists_2.append(list(map_list.keys())) - for weights_list_2 in weights_lists_2: - self.assertListEqual(weights_list, weights_list_2) + for weights_list_2 in weights_lists_2: + self.assertListEqual(weights_list, weights_list_2) - def test_mask_output(self): - if sys.version_info <= (3, 0): - return - - tokenizer = self.get_tokenizer() - - if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer": - seq_0 = "Test this method." - seq_1 = "With these inputs." - information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True) - sequences, mask = information["input_ids"], information["token_type_ids"] - self.assertEqual(len(sequences), len(mask)) - - def test_number_of_added_tokens(self): - tokenizer = self.get_tokenizer() + def test_mask_output(self): + if sys.version_info <= (3, 0): + return + + tokenizer = self.get_tokenizer() + if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer": seq_0 = "Test this method." seq_1 = "With these inputs." + information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True) + sequences, mask = information["input_ids"], information["token_type_ids"] + self.assertEqual(len(sequences), len(mask)) - sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False) - attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) + def test_number_of_added_tokens(self): + tokenizer = self.get_tokenizer() - # Method is implemented (e.g. not GPT-2) - if len(attached_sequences) != 2: - self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences)) + seq_0 = "Test this method." + seq_1 = "With these inputs." - def test_maximum_encoding_length_single_input(self): - tokenizer = self.get_tokenizer() + sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False) + attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) - seq_0 = "This is a sentence to be encoded." - stride = 2 + # Method is implemented (e.g. not GPT-2) + if len(attached_sequences) != 2: + self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences)) - sequence = tokenizer.encode(seq_0, add_special_tokens=False) - num_added_tokens = tokenizer.num_added_tokens() - total_length = len(sequence) + num_added_tokens - information = tokenizer.encode_plus( - seq_0, - max_length=total_length - 2, - add_special_tokens=True, - stride=stride, - return_overflowing_tokens=True, - ) + def test_maximum_encoding_length_single_input(self): + tokenizer = self.get_tokenizer() - truncated_sequence = information["input_ids"] - overflowing_tokens = information["overflowing_tokens"] + seq_0 = "This is a sentence to be encoded." + stride = 2 - self.assertEqual(len(overflowing_tokens), 2 + stride) - self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) - self.assertEqual(len(truncated_sequence), total_length - 2) - self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2])) + sequence = tokenizer.encode(seq_0, add_special_tokens=False) + num_added_tokens = tokenizer.num_added_tokens() + total_length = len(sequence) + num_added_tokens + information = tokenizer.encode_plus( + seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride, return_overflowing_tokens=True, + ) - def test_maximum_encoding_length_pair_input(self): - tokenizer = self.get_tokenizer() + truncated_sequence = information["input_ids"] + overflowing_tokens = information["overflowing_tokens"] - seq_0 = "This is a sentence to be encoded." - seq_1 = "This is another sentence to be encoded." - stride = 2 + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) + self.assertEqual(len(truncated_sequence), total_length - 2) + self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2])) - sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False) - sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False) + def test_maximum_encoding_length_pair_input(self): + tokenizer = self.get_tokenizer() - sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) - truncated_second_sequence = tokenizer.build_inputs_with_special_tokens( - tokenizer.encode(seq_0, add_special_tokens=False), - tokenizer.encode(seq_1, add_special_tokens=False)[:-2], - ) + seq_0 = "This is a sentence to be encoded." + seq_1 = "This is another sentence to be encoded." + stride = 2 - information = tokenizer.encode_plus( - seq_0, - seq_1, - max_length=len(sequence) - 2, - add_special_tokens=True, - stride=stride, - truncation_strategy="only_second", - return_overflowing_tokens=True, - ) - information_first_truncated = tokenizer.encode_plus( - seq_0, - seq_1, - max_length=len(sequence) - 2, - add_special_tokens=True, - stride=stride, - truncation_strategy="only_first", - return_overflowing_tokens=True, - ) + sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False) + sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False) - truncated_sequence = information["input_ids"] - overflowing_tokens = information["overflowing_tokens"] - overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"] + sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) + truncated_second_sequence = tokenizer.build_inputs_with_special_tokens( + tokenizer.encode(seq_0, add_special_tokens=False), tokenizer.encode(seq_1, add_special_tokens=False)[:-2], + ) - self.assertEqual(len(overflowing_tokens), 2 + stride) - self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :]) - self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :]) - self.assertEqual(len(truncated_sequence), len(sequence) - 2) - self.assertEqual(truncated_sequence, truncated_second_sequence) + information = tokenizer.encode_plus( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=True, + stride=stride, + truncation_strategy="only_second", + return_overflowing_tokens=True, + ) + information_first_truncated = tokenizer.encode_plus( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=True, + stride=stride, + truncation_strategy="only_first", + return_overflowing_tokens=True, + ) - def test_encode_input_type(self): - tokenizer = self.get_tokenizer() + truncated_sequence = information["input_ids"] + overflowing_tokens = information["overflowing_tokens"] + overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"] - sequence = "Let's encode this sequence" + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :]) + self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :]) + self.assertEqual(len(truncated_sequence), len(sequence) - 2) + self.assertEqual(truncated_sequence, truncated_second_sequence) - tokens = tokenizer.tokenize(sequence) - input_ids = tokenizer.convert_tokens_to_ids(tokens) - formatted_input = tokenizer.encode(sequence, add_special_tokens=True) + def test_encode_input_type(self): + tokenizer = self.get_tokenizer() - self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input) - self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input) + sequence = "Let's encode this sequence" - def test_special_tokens_mask(self): - tokenizer = self.get_tokenizer() + tokens = tokenizer.tokenize(sequence) + input_ids = tokenizer.convert_tokens_to_ids(tokens) + formatted_input = tokenizer.encode(sequence, add_special_tokens=True) - sequence_0 = "Encode this." - sequence_1 = "This one too please." + self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input) + self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input) - # Testing single inputs - encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus( - sequence_0, add_special_tokens=True, return_special_tokens_mask=True - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + def test_special_tokens_mask(self): + tokenizer = self.get_tokenizer() - filtered_sequence = [ - (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) - ] - filtered_sequence = [x for x in filtered_sequence if x is not None] - self.assertEqual(encoded_sequence, filtered_sequence) + sequence_0 = "Encode this." + sequence_1 = "This one too please." - # Testing inputs pairs - encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode( - sequence_1, add_special_tokens=False - ) - encoded_sequence_dict = tokenizer.encode_plus( - sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + # Testing single inputs + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, add_special_tokens=True, return_special_tokens_mask=True + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - filtered_sequence = [ - (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) - ] - filtered_sequence = [x for x in filtered_sequence if x is not None] - self.assertEqual(encoded_sequence, filtered_sequence) + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] + filtered_sequence = [x for x in filtered_sequence if x is not None] + self.assertEqual(encoded_sequence, filtered_sequence) - # Testing with already existing special tokens - if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id: - tokenizer.add_special_tokens({"cls_token": "", "sep_token": ""}) - encoded_sequence_dict = tokenizer.encode_plus( - sequence_0, add_special_tokens=True, return_special_tokens_mask=True - ) - encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"] - special_tokens_mask = tokenizer.get_special_tokens_mask( - encoded_sequence_w_special, already_has_special_tokens=True - ) - self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - self.assertEqual(special_tokens_mask_orig, special_tokens_mask) + # Testing inputs pairs + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode( + sequence_1, add_special_tokens=False + ) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - def test_padding_to_max_length(self): - tokenizer = self.get_tokenizer() + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] + filtered_sequence = [x for x in filtered_sequence if x is not None] + self.assertEqual(encoded_sequence, filtered_sequence) - sequence = "Sequence" - padding_size = 10 - padding_idx = tokenizer.pad_token_id + # Testing with already existing special tokens + if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id: + tokenizer.add_special_tokens({"cls_token": "", "sep_token": ""}) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, add_special_tokens=True, return_special_tokens_mask=True + ) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"] + special_tokens_mask = tokenizer.get_special_tokens_mask( + encoded_sequence_w_special, already_has_special_tokens=True + ) + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) + self.assertEqual(special_tokens_mask_orig, special_tokens_mask) - # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(sequence) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode( - sequence, max_length=sequence_length + padding_size, pad_to_max_length=True - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + def test_padding_to_max_length(self): + tokenizer = self.get_tokenizer() - # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "left" - encoded_sequence = tokenizer.encode(sequence) - sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode( - sequence, max_length=sequence_length + padding_size, pad_to_max_length=True - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert [padding_idx] * padding_size + encoded_sequence == padded_sequence + sequence = "Sequence" + padding_size = 10 + padding_idx = tokenizer.pad_token_id - # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode(sequence) - sequence_length = len(encoded_sequence) + # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) + # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "left" + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert [padding_idx] * padding_size + encoded_sequence == padded_sequence - tokenizer.padding_side = "left" - padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True) - padded_sequence_left_length = len(padded_sequence_left) + # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - assert sequence_length == padded_sequence_left_length - assert encoded_sequence == padded_sequence_left + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True) + padded_sequence_right_length = len(padded_sequence_right) - def test_encode_plus_with_padding(self): - tokenizer = self.get_tokenizer() + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True) + padded_sequence_left_length = len(padded_sequence_left) - sequence = "Sequence" - padding_size = 10 - padding_idx = tokenizer.pad_token_id - token_type_padding_idx = tokenizer.pad_token_type_id + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left - encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True) - input_ids = encoded_sequence["input_ids"] - token_type_ids = encoded_sequence["token_type_ids"] - attention_mask = encoded_sequence["attention_mask"] - special_tokens_mask = encoded_sequence["special_tokens_mask"] - sequence_length = len(input_ids) + def test_encode_plus_with_padding(self): + tokenizer = self.get_tokenizer() - # Test right padding - tokenizer.padding_side = "right" - padded_sequence = tokenizer.encode_plus( - sequence, - max_length=sequence_length + padding_size, - pad_to_max_length=True, - return_special_tokens_mask=True, - ) - padded_input_ids = padded_sequence["input_ids"] - padded_token_type_ids = padded_sequence["token_type_ids"] - padded_attention_mask = padded_sequence["attention_mask"] - padded_special_tokens_mask = padded_sequence["special_tokens_mask"] - padded_sequence_length = len(padded_input_ids) + sequence = "Sequence" + padding_size = 10 + padding_idx = tokenizer.pad_token_id + token_type_padding_idx = tokenizer.pad_token_type_id - assert sequence_length + padding_size == padded_sequence_length - assert input_ids + [padding_idx] * padding_size == padded_input_ids - assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids - assert attention_mask + [0] * padding_size == padded_attention_mask - assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask + encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True) + input_ids = encoded_sequence["input_ids"] + token_type_ids = encoded_sequence["token_type_ids"] + attention_mask = encoded_sequence["attention_mask"] + special_tokens_mask = encoded_sequence["special_tokens_mask"] + sequence_length = len(input_ids) - # Test left padding - tokenizer.padding_side = "left" - padded_sequence = tokenizer.encode_plus( - sequence, - max_length=sequence_length + padding_size, - pad_to_max_length=True, - return_special_tokens_mask=True, - ) - padded_input_ids = padded_sequence["input_ids"] - padded_token_type_ids = padded_sequence["token_type_ids"] - padded_attention_mask = padded_sequence["attention_mask"] - padded_special_tokens_mask = padded_sequence["special_tokens_mask"] - padded_sequence_length = len(padded_input_ids) + # Test right padding + tokenizer.padding_side = "right" + padded_sequence = tokenizer.encode_plus( + sequence, + max_length=sequence_length + padding_size, + pad_to_max_length=True, + return_special_tokens_mask=True, + ) + padded_input_ids = padded_sequence["input_ids"] + padded_token_type_ids = padded_sequence["token_type_ids"] + padded_attention_mask = padded_sequence["attention_mask"] + padded_special_tokens_mask = padded_sequence["special_tokens_mask"] + padded_sequence_length = len(padded_input_ids) - assert sequence_length + padding_size == padded_sequence_length - assert [padding_idx] * padding_size + input_ids == padded_input_ids - assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids - assert [0] * padding_size + attention_mask == padded_attention_mask - assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask + assert sequence_length + padding_size == padded_sequence_length + assert input_ids + [padding_idx] * padding_size == padded_input_ids + assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids + assert attention_mask + [0] * padding_size == padded_attention_mask + assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask + + # Test left padding + tokenizer.padding_side = "left" + padded_sequence = tokenizer.encode_plus( + sequence, + max_length=sequence_length + padding_size, + pad_to_max_length=True, + return_special_tokens_mask=True, + ) + padded_input_ids = padded_sequence["input_ids"] + padded_token_type_ids = padded_sequence["token_type_ids"] + padded_attention_mask = padded_sequence["attention_mask"] + padded_special_tokens_mask = padded_sequence["special_tokens_mask"] + padded_sequence_length = len(padded_input_ids) + + assert sequence_length + padding_size == padded_sequence_length + assert [padding_idx] * padding_size + input_ids == padded_input_ids + assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids + assert [0] * padding_size + attention_mask == padded_attention_mask + assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask diff --git a/tests/test_tokenization_ctrl.py b/tests/test_tokenization_ctrl.py index 77ff6a86ea..89d9f78024 100644 --- a/tests/test_tokenization_ctrl.py +++ b/tests/test_tokenization_ctrl.py @@ -15,14 +15,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera import json import os +import unittest from io import open from transformers.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin -class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester): +class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = CTRLTokenizer diff --git a/tests/test_tokenization_gpt2.py b/tests/test_tokenization_gpt2.py index fbc45738f0..3dea20d8c5 100644 --- a/tests/test_tokenization_gpt2.py +++ b/tests/test_tokenization_gpt2.py @@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera import json import os +import unittest from io import open from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin -class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): +class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = GPT2Tokenizer diff --git a/tests/test_tokenization_openai.py b/tests/test_tokenization_openai.py index a9e8cc38ea..fed01c9efe 100644 --- a/tests/test_tokenization_openai.py +++ b/tests/test_tokenization_openai.py @@ -16,13 +16,14 @@ from __future__ import absolute_import, division, print_function, unicode_litera import json import os +import unittest from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin -class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): +class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = OpenAIGPTTokenizer diff --git a/tests/test_tokenization_roberta.py b/tests/test_tokenization_roberta.py index bacfd51555..da60063356 100644 --- a/tests/test_tokenization_roberta.py +++ b/tests/test_tokenization_roberta.py @@ -16,15 +16,16 @@ from __future__ import absolute_import, division, print_function, unicode_litera import json import os +import unittest from io import open from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin from .utils import slow -class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): +class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = RobertaTokenizer def setUp(self): diff --git a/tests/test_tokenization_t5.py b/tests/test_tokenization_t5.py index 68ceebb83c..62a1d148a7 100644 --- a/tests/test_tokenization_t5.py +++ b/tests/test_tokenization_t5.py @@ -15,17 +15,18 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os +import unittest from transformers.tokenization_t5 import T5Tokenizer from transformers.tokenization_xlnet import SPIECE_UNDERLINE -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") -class T5TokenizationTest(CommonTestCases.CommonTokenizerTester): +class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = T5Tokenizer diff --git a/tests/test_tokenization_transfo_xl.py b/tests/test_tokenization_transfo_xl.py index 1d275f591f..cd6c7c58f9 100644 --- a/tests/test_tokenization_transfo_xl.py +++ b/tests/test_tokenization_transfo_xl.py @@ -15,11 +15,12 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os +import unittest from io import open from transformers import is_torch_available -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin from .utils import require_torch @@ -28,7 +29,7 @@ if is_torch_available(): @require_torch -class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester): +class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = TransfoXLTokenizer if is_torch_available() else None diff --git a/tests/test_tokenization_xlm.py b/tests/test_tokenization_xlm.py index 12bff7f618..9f9dd8fbc4 100644 --- a/tests/test_tokenization_xlm.py +++ b/tests/test_tokenization_xlm.py @@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera import json import os +import unittest from transformers.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin from .utils import slow -class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): +class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XLMTokenizer diff --git a/tests/test_tokenization_xlnet.py b/tests/test_tokenization_xlnet.py index ed6430959e..dcc270e683 100644 --- a/tests/test_tokenization_xlnet.py +++ b/tests/test_tokenization_xlnet.py @@ -15,17 +15,18 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os +import unittest from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer -from .test_tokenization_commo import CommonTestCases +from .test_tokenization_common import TokenizerTesterMixin from .utils import slow SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") -class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): +class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XLNetTokenizer