Adding Fast tokenizers for SentencePiece based tokenizers - Breaking: remove Transfo-XL fast tokenizer (#7141)
* [WIP] SP tokenizers * fixing tests for T5 * WIP tokenizers * serialization * update T5 * WIP T5 tokenization * slow to fast conversion script * Refactoring to move tokenzier implementations inside transformers * Adding gpt - refactoring - quality * WIP adding several tokenizers to the fast world * WIP Roberta - moving implementations * update to dev4 switch file loading to in-memory loading * Updating and fixing * advancing on the tokenizers - updating do_lower_case * style and quality * moving forward with tokenizers conversion and tests * MBart, T5 * dumping the fast version of transformer XL * Adding to autotokenizers + style/quality * update init and space_between_special_tokens * style and quality * bump up tokenizers version * add protobuf * fix pickle Bert JP with Mecab * fix newly added tokenizers * style and quality * fix bert japanese * fix funnel * limite tokenizer warning to one occurence * clean up file * fix new tokenizers * fast tokenizers deep tests * WIP adding all the special fast tests on the new fast tokenizers * quick fix * adding more fast tokenizers in the fast tests * all tokenizers in fast version tested * Adding BertGenerationFast * bump up setup.py for CI * remove BertGenerationFast (too early) * bump up tokenizers version * Clean old docstrings * Typo * Update following Lysandre comments Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
This commit is contained in:
@@ -17,7 +17,7 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from transformers.tokenization_albert import AlbertTokenizer
|
||||
from transformers.tokenization_albert import AlbertTokenizer, AlbertTokenizerFast
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
@@ -28,6 +28,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
|
||||
class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = AlbertTokenizer
|
||||
rust_tokenizer_class = AlbertTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
@@ -41,6 +43,28 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
output_text = "this is a test"
|
||||
return input_text, output_text
|
||||
|
||||
def test_rust_and_python_full_tokenizers(self):
|
||||
if not self.test_rust_tokenizer:
|
||||
return
|
||||
|
||||
tokenizer = self.get_tokenizer()
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
|
||||
sequence = "I was born in 92000, and this is falsé."
|
||||
|
||||
tokens = tokenizer.tokenize(sequence)
|
||||
rust_tokens = rust_tokenizer.tokenize(sequence)
|
||||
self.assertListEqual(tokens, rust_tokens)
|
||||
|
||||
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
ids = tokenizer.encode(sequence)
|
||||
rust_ids = rust_tokenizer.encode(sequence)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||
|
||||
|
||||
@@ -12,6 +12,8 @@ from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer_class = BartTokenizer
|
||||
rust_tokenizer_class = BartTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -35,7 +35,9 @@ from .test_tokenization_common import TokenizerTesterMixin
|
||||
class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = BertTokenizer
|
||||
rust_tokenizer_class = BertTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
space_between_special_tokens = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
@@ -61,9 +63,6 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||
|
||||
def get_rust_tokenizer(self, **kwargs):
|
||||
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self, tokenizer):
|
||||
input_text = "UNwant\u00E9d,running"
|
||||
output_text = "unwanted, running"
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import unittest
|
||||
|
||||
from transformers.testing_utils import custom_tokenizers
|
||||
@@ -33,6 +34,7 @@ from .test_tokenization_common import TokenizerTesterMixin
|
||||
class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = BertJapaneseTokenizer
|
||||
space_between_special_tokens = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
@@ -87,6 +89,26 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
|
||||
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
|
||||
|
||||
def test_pickle_mecab_tokenizer(self):
|
||||
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="mecab")
|
||||
self.assertIsNotNone(tokenizer)
|
||||
|
||||
text = "こんにちは、世界。\nこんばんは、世界。"
|
||||
tokens = tokenizer.tokenize(text)
|
||||
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
|
||||
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
|
||||
|
||||
filename = os.path.join(self.tmpdirname, "tokenizer.bin")
|
||||
with open(filename, "wb") as handle:
|
||||
pickle.dump(tokenizer, handle)
|
||||
|
||||
with open(filename, "rb") as handle:
|
||||
tokenizer_new = pickle.load(handle)
|
||||
|
||||
tokens_loaded = tokenizer_new.tokenize(text)
|
||||
|
||||
self.assertListEqual(tokens, tokens_loaded)
|
||||
|
||||
def test_mecab_tokenizer_ipadic(self):
|
||||
tokenizer = MecabTokenizer(mecab_dic="ipadic")
|
||||
|
||||
|
||||
64
tests/test_tokenization_camembert.py
Normal file
64
tests/test_tokenization_camembert.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from transformers.testing_utils import _torch_available
|
||||
from transformers.tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
|
||||
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
|
||||
|
||||
FRAMEWORK = "pt" if _torch_available else "tf"
|
||||
|
||||
|
||||
class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = CamembertTokenizer
|
||||
rust_tokenizer_class = CamembertTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
# We have a SentencePiece fixture for testing
|
||||
tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
|
||||
tokenizer.save_pretrained(self.tmpdirname)
|
||||
|
||||
def test_rust_and_python_full_tokenizers(self):
|
||||
if not self.test_rust_tokenizer:
|
||||
return
|
||||
|
||||
tokenizer = self.get_tokenizer()
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
|
||||
sequence = "I was born in 92000, and this is falsé."
|
||||
|
||||
tokens = tokenizer.tokenize(sequence)
|
||||
rust_tokens = rust_tokenizer.tokenize(sequence)
|
||||
self.assertListEqual(tokens, rust_tokens)
|
||||
|
||||
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
ids = tokenizer.encode(sequence)
|
||||
rust_ids = rust_tokenizer.encode(sequence)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
@@ -56,7 +56,9 @@ def merge_model_tokenizer_mappings(
|
||||
class TokenizerTesterMixin:
|
||||
|
||||
tokenizer_class = None
|
||||
rust_tokenizer_class = None
|
||||
test_rust_tokenizer = False
|
||||
space_between_special_tokens = False
|
||||
|
||||
def setUp(self):
|
||||
self.tmpdirname = tempfile.mkdtemp()
|
||||
@@ -68,12 +70,15 @@ class TokenizerTesterMixin:
|
||||
input_txt = self.get_clean_sequence(tokenizer)[0]
|
||||
return input_txt, input_txt
|
||||
|
||||
def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20) -> Tuple[str, list]:
|
||||
def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
|
||||
toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
|
||||
toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
|
||||
toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
|
||||
if max_length is not None and len(toks) > max_length:
|
||||
toks = toks[:max_length]
|
||||
if min_length is not None and len(toks) < min_length and len(toks) > 0:
|
||||
while len(toks) < min_length:
|
||||
toks = toks + toks
|
||||
# toks_str = [t[1] for t in toks]
|
||||
toks_ids = [t[0] for t in toks]
|
||||
|
||||
@@ -99,7 +104,7 @@ class TokenizerTesterMixin:
|
||||
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
|
||||
raise NotImplementedError
|
||||
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
# def get_input_output_texts(self) -> Tuple[str, str]:
|
||||
# """Feel free to overwrite"""
|
||||
@@ -118,6 +123,29 @@ class TokenizerTesterMixin:
|
||||
for i in range(len(batch_encode_plus_sequences["input_ids"]))
|
||||
]
|
||||
|
||||
def test_rust_and_python_full_tokenizers(self):
|
||||
if not self.test_rust_tokenizer:
|
||||
return
|
||||
|
||||
tokenizer = self.get_tokenizer()
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
|
||||
sequence, _ = self.get_input_output_texts(tokenizer)
|
||||
|
||||
# We don't have an exact equivalence on `tokenize()` between Rust and Slow
|
||||
# Slow tokenizer only split tokens, Rust tokenizers will replace with <unk>
|
||||
# tokens = tokenizer.tokenize(sequence)
|
||||
# rust_tokens = rust_tokenizer.tokenize(sequence)
|
||||
# self.assertListEqual(tokens, rust_tokens)
|
||||
|
||||
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
ids = tokenizer.encode(sequence, add_special_tokens=True)
|
||||
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=True)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
def test_tokenizers_common_properties(self):
|
||||
tokenizers = self.get_tokenizers()
|
||||
for tokenizer in tokenizers:
|
||||
@@ -241,6 +269,9 @@ class TokenizerTesterMixin:
|
||||
tokenizers = self.get_tokenizers(fast=False, do_lower_case=True)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case:
|
||||
continue
|
||||
|
||||
special_token = tokenizer.all_special_tokens[0]
|
||||
|
||||
text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
|
||||
@@ -272,6 +303,9 @@ class TokenizerTesterMixin:
|
||||
tokenizers = self.get_tokenizers(fast=False, do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case:
|
||||
continue
|
||||
|
||||
special_token = tokenizer.all_special_tokens[0]
|
||||
|
||||
text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
|
||||
@@ -282,7 +316,7 @@ class TokenizerTesterMixin:
|
||||
toks0 = tokenizer.tokenize(text) # toks before adding new_toks
|
||||
|
||||
added = tokenizer.add_tokens(new_toks)
|
||||
self.assertEqual(added, 4)
|
||||
self.assertIn(added, [2, 4])
|
||||
|
||||
toks = tokenizer.tokenize(text)
|
||||
toks2 = tokenizer.tokenize(text2)
|
||||
@@ -390,12 +424,17 @@ class TokenizerTesterMixin:
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
|
||||
new_toks = ["[ABC]", "[DEF]"] # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"]
|
||||
# new_toks = ["[ABC]", "[DEF]"] # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"]
|
||||
new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
|
||||
tokenizer.add_tokens(new_toks)
|
||||
input = "[ABC] [DEF] [ABC] [DEF]" # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]"
|
||||
input = "[ABC][DEF][ABC][DEF]" # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]"
|
||||
if self.space_between_special_tokens:
|
||||
output = "[ABC] [DEF] [ABC] [DEF]"
|
||||
else:
|
||||
output = input
|
||||
encoded = tokenizer.encode(input, add_special_tokens=False)
|
||||
decoded = tokenizer.decode(encoded)
|
||||
self.assertEqual(decoded, input)
|
||||
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
|
||||
self.assertIn(decoded, [output, output.lower()])
|
||||
|
||||
def test_pretrained_model_lists(self):
|
||||
weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
|
||||
@@ -447,7 +486,7 @@ class TokenizerTesterMixin:
|
||||
sequence = tokenizer.encode(seq_0, add_special_tokens=False)
|
||||
total_length = len(sequence)
|
||||
|
||||
assert total_length > 1, "Issue with the testing sequence, please update it it's too short"
|
||||
assert total_length > 4, "Issue with the testing sequence, please update it it's too short"
|
||||
|
||||
# Test with max model input length
|
||||
model_max_length = tokenizer.model_max_length
|
||||
@@ -546,6 +585,7 @@ class TokenizerTesterMixin:
|
||||
model_max_length = tokenizer.model_max_length
|
||||
self.assertEqual(model_max_length, 100)
|
||||
seq_2 = seq_0 * model_max_length
|
||||
assert len(seq_2) > model_max_length
|
||||
|
||||
sequence1 = tokenizer(seq_1, add_special_tokens=False)
|
||||
total_length1 = len(sequence1["input_ids"])
|
||||
@@ -559,9 +599,9 @@ class TokenizerTesterMixin:
|
||||
[False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
|
||||
)
|
||||
for padding_state in padding_strategies:
|
||||
with self.subTest(f"Padding: {padding_state}"):
|
||||
with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"):
|
||||
for truncation_state in [True, "longest_first", "only_first"]:
|
||||
with self.subTest(f"Truncation: {truncation_state}"):
|
||||
with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"):
|
||||
output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state)
|
||||
self.assertEqual(len(output["input_ids"]), model_max_length)
|
||||
|
||||
@@ -748,34 +788,47 @@ class TokenizerTesterMixin:
|
||||
# # This is not supported with the Rust tokenizers
|
||||
# # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
|
||||
|
||||
def test_swap_special_token(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
mask = "<mask>"
|
||||
sequence = "Encode this sequence"
|
||||
sequence_masked_0 = "Encode <mask> sequence"
|
||||
sequence_masked_1 = "<mask> this sequence"
|
||||
# def test_swap_special_token(self):
|
||||
# tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
# for tokenizer in tokenizers:
|
||||
# with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
# # Our mask token
|
||||
# mask = "<mask>"
|
||||
# # We take a single word in the middle of the vocabulary
|
||||
# all_tokens = sorted(tokenizer.get_vocab().keys())
|
||||
# word = tokenizer.decode(tokenizer.encode(all_tokens[len(all_tokens)//2], add_special_tokens=False)[:1])
|
||||
|
||||
# Add tokens so that masked token isn't split
|
||||
tokenizer.add_tokens(sequence.split())
|
||||
tokenizer.add_special_tokens({"mask_token": mask})
|
||||
mask_ind = tokenizer.convert_tokens_to_ids(mask)
|
||||
encoded = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
# sequence_0 = "Encode " + word + " sequence"
|
||||
# sequence_masked_0 = "Encode " + mask + " sequence"
|
||||
|
||||
# Test first masked sequence
|
||||
encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
|
||||
mask_loc = encoded_masked.index(mask_ind)
|
||||
encoded_masked[mask_loc] = encoded[mask_loc]
|
||||
# sequence_1 = word + " this sequence"
|
||||
# sequence_masked_1 = mask + " this sequence"
|
||||
|
||||
self.assertEqual(encoded_masked, encoded)
|
||||
# # Add tokens so that masked token isn't split
|
||||
# # tokens = [AddedToken(t, lstrip=True, normalized=False) for t in sequence.split()]
|
||||
# # tokenizer.add_tokens(tokens)
|
||||
# tokenizer.add_special_tokens(
|
||||
# {"mask_token": AddedToken(mask, normalized=False)}
|
||||
# ) # Eat left space on Byte-level BPE tokenizers
|
||||
# mask_ind = tokenizer.convert_tokens_to_ids(mask)
|
||||
|
||||
# Test second masked sequence
|
||||
encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
|
||||
mask_loc = encoded_masked.index(mask_ind)
|
||||
encoded_masked[mask_loc] = encoded[mask_loc]
|
||||
# # Test first masked sequence
|
||||
# encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False)
|
||||
# encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
|
||||
# assert len(encoded_masked) == len(encoded_0)
|
||||
# mask_loc = encoded_masked.index(mask_ind)
|
||||
# encoded_masked[mask_loc] = encoded_0[mask_loc]
|
||||
|
||||
self.assertEqual(encoded_masked, encoded)
|
||||
# self.assertEqual(encoded_masked, encoded_0)
|
||||
|
||||
# # Test second masked sequence
|
||||
# encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False)
|
||||
# encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
|
||||
# assert len(encoded_masked) == len(encoded_1)
|
||||
# mask_loc = encoded_masked.index(mask_ind)
|
||||
# encoded_masked[mask_loc] = encoded_1[mask_loc]
|
||||
|
||||
# self.assertEqual(encoded_masked, encoded_1)
|
||||
|
||||
def test_special_tokens_mask(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
@@ -919,10 +972,10 @@ class TokenizerTesterMixin:
|
||||
def test_padding_to_multiple_of(self):
|
||||
tokenizers = self.get_tokenizers()
|
||||
for tokenizer in tokenizers:
|
||||
if tokenizer.pad_token is None:
|
||||
self.skipTest("No padding token.")
|
||||
else:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
if tokenizer.pad_token is None:
|
||||
self.skipTest("No padding token.")
|
||||
else:
|
||||
empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
|
||||
normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
|
||||
for key, value in empty_tokens.items():
|
||||
@@ -1063,14 +1116,15 @@ class TokenizerTesterMixin:
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
vocab = tokenizer.get_vocab()
|
||||
vocab_dict = tokenizer.get_vocab()
|
||||
self.assertIsInstance(vocab_dict, dict)
|
||||
self.assertGreaterEqual(len(tokenizer), len(vocab_dict))
|
||||
|
||||
self.assertIsInstance(vocab, dict)
|
||||
vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
|
||||
self.assertEqual(len(vocab), len(tokenizer))
|
||||
|
||||
tokenizer.add_tokens(["asdfasdfasdfasdf"])
|
||||
vocab = tokenizer.get_vocab()
|
||||
self.assertIsInstance(vocab, dict)
|
||||
vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
|
||||
self.assertEqual(len(vocab), len(tokenizer))
|
||||
|
||||
def test_conversion_reversible(self):
|
||||
@@ -1079,6 +1133,8 @@ class TokenizerTesterMixin:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
vocab = tokenizer.get_vocab()
|
||||
for word, ind in vocab.items():
|
||||
if word == tokenizer.unk_token:
|
||||
continue
|
||||
self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind)
|
||||
self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word)
|
||||
|
||||
@@ -1173,12 +1229,13 @@ class TokenizerTesterMixin:
|
||||
def test_added_token_serializable(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
new_token = AddedToken("new_token", lstrip=True)
|
||||
tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
new_token = AddedToken("new_token", lstrip=True)
|
||||
tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir_name:
|
||||
tokenizer.save_pretrained(tmp_dir_name)
|
||||
tokenizer.from_pretrained(tmp_dir_name)
|
||||
with tempfile.TemporaryDirectory() as tmp_dir_name:
|
||||
tokenizer.save_pretrained(tmp_dir_name)
|
||||
tokenizer.from_pretrained(tmp_dir_name)
|
||||
|
||||
def test_batch_encode_plus_padding(self):
|
||||
# Test that padded sequences are equivalent between batch_encode_plus and encode_plus
|
||||
@@ -1243,6 +1300,9 @@ class TokenizerTesterMixin:
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
|
||||
if hasattr(tokenizer, "add_prefix_space") and not tokenizer.add_prefix_space:
|
||||
continue
|
||||
|
||||
# Prepare a sequence from our tokenizer vocabulary
|
||||
sequence, ids = self.get_clean_sequence(tokenizer, with_prefix_space=True, max_length=20)
|
||||
# sequence = " " + sequence # To be sure the byte-level tokenizers are feeling good
|
||||
@@ -1345,12 +1405,14 @@ class TokenizerTesterMixin:
|
||||
def test_prepare_for_model(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
string_sequence = "Testing the prepare_for_model method."
|
||||
ids = tokenizer.encode(string_sequence, add_special_tokens=False)
|
||||
input_dict = tokenizer.encode_plus(string_sequence)
|
||||
prepared_input_dict = tokenizer.prepare_for_model(ids)
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
string_sequence = "Testing the prepare_for_model method."
|
||||
ids = tokenizer.encode(string_sequence, add_special_tokens=False)
|
||||
prepared_input_dict = tokenizer.prepare_for_model(ids, add_special_tokens=True)
|
||||
|
||||
self.assertEqual(input_dict, prepared_input_dict)
|
||||
input_dict = tokenizer.encode_plus(string_sequence, add_special_tokens=True)
|
||||
|
||||
self.assertEqual(input_dict, prepared_input_dict)
|
||||
|
||||
def test_batch_encode_plus_overflowing_tokens(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
|
||||
@@ -25,6 +25,7 @@ from .test_tokenization_common import TokenizerTesterMixin
|
||||
class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = CTRLTokenizer
|
||||
test_rust_tokenizer = False
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -23,9 +23,8 @@ from .test_tokenization_bert import BertTokenizationTest
|
||||
class DistilBertTokenizationTest(BertTokenizationTest):
|
||||
|
||||
tokenizer_class = DistilBertTokenizer
|
||||
|
||||
def get_rust_tokenizer(self, **kwargs):
|
||||
return DistilBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
||||
rust_tokenizer_class = DistilBertTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
@slow
|
||||
def test_sequence_builders(self):
|
||||
|
||||
@@ -32,25 +32,22 @@ from .test_tokenization_bert import BertTokenizationTest
|
||||
class DPRContextEncoderTokenizationTest(BertTokenizationTest):
|
||||
|
||||
tokenizer_class = DPRContextEncoderTokenizer
|
||||
|
||||
def get_rust_tokenizer(self, **kwargs):
|
||||
return DPRContextEncoderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
||||
rust_tokenizer_class = DPRContextEncoderTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
|
||||
class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
|
||||
|
||||
tokenizer_class = DPRQuestionEncoderTokenizer
|
||||
|
||||
def get_rust_tokenizer(self, **kwargs):
|
||||
return DPRQuestionEncoderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
||||
rust_tokenizer_class = DPRQuestionEncoderTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
|
||||
class DPRReaderTokenizationTest(BertTokenizationTest):
|
||||
|
||||
tokenizer_class = DPRReaderTokenizer
|
||||
|
||||
def get_rust_tokenizer(self, **kwargs):
|
||||
return DPRReaderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
||||
rust_tokenizer_class = DPRReaderTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
@slow
|
||||
def test_decode_best_spans(self):
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -26,6 +26,7 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = FunnelTokenizer
|
||||
test_rust_tokenizer = True
|
||||
space_between_special_tokens = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -26,6 +26,7 @@ from .test_tokenization_common import TokenizerTesterMixin
|
||||
class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = GPT2Tokenizer
|
||||
rust_tokenizer_class = GPT2TokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
|
||||
@@ -18,7 +18,7 @@ import os
|
||||
import unittest
|
||||
|
||||
from transformers.tokenization_bert import VOCAB_FILES_NAMES
|
||||
from transformers.tokenization_lxmert import LxmertTokenizer
|
||||
from transformers.tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
@@ -26,6 +26,9 @@ from .test_tokenization_common import TokenizerTesterMixin
|
||||
class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = LxmertTokenizer
|
||||
rust_tokenizer_class = LxmertTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
space_between_special_tokens = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
@@ -49,9 +52,6 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return LxmertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self, tokenizer):
|
||||
input_text = "UNwant\u00E9d,running"
|
||||
output_text = "unwanted, running"
|
||||
@@ -63,3 +63,25 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokens = tokenizer.tokenize("UNwant\u00E9d,running")
|
||||
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
|
||||
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
|
||||
|
||||
def test_rust_and_python_full_tokenizers(self):
|
||||
if not self.test_rust_tokenizer:
|
||||
return
|
||||
|
||||
tokenizer = self.get_tokenizer()
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
|
||||
sequence = "I was born in 92000, and this is falsé."
|
||||
|
||||
tokens = tokenizer.tokenize(sequence)
|
||||
rust_tokens = rust_tokenizer.tokenize(sequence)
|
||||
self.assertListEqual(tokens, rust_tokens)
|
||||
|
||||
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
ids = tokenizer.encode(sequence)
|
||||
rust_ids = rust_tokenizer.encode(sequence)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
@@ -38,6 +38,7 @@ FRAMEWORK = "pt" if _torch_available else "tf"
|
||||
class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = MarianTokenizer
|
||||
test_rust_tokenizer = False
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer, is_torch_available
|
||||
from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer, MBartTokenizerFast, is_torch_available
|
||||
from transformers.testing_utils import require_torch
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
@@ -17,6 +17,8 @@ RO_CODE = 250020
|
||||
|
||||
class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer_class = MBartTokenizer
|
||||
rust_tokenizer_class = MBartTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -18,7 +18,7 @@ import json
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer
|
||||
from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
|
||||
class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = OpenAIGPTTokenizer
|
||||
rust_tokenizer_class = OpenAIGPTTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -3,7 +3,7 @@ from pathlib import Path
|
||||
|
||||
from transformers.file_utils import cached_property
|
||||
from transformers.testing_utils import require_torch
|
||||
from transformers.tokenization_pegasus import PegasusTokenizer
|
||||
from transformers.tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
@@ -11,6 +11,8 @@ from .test_tokenization_common import TokenizerTesterMixin
|
||||
class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = PegasusTokenizer
|
||||
rust_tokenizer_class = PegasusTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
|
||||
from transformers.file_utils import cached_property
|
||||
from transformers.testing_utils import require_torch, slow
|
||||
from transformers.tokenization_reformer import SPIECE_UNDERLINE, ReformerTokenizer
|
||||
from transformers.tokenization_reformer import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
@@ -30,6 +30,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
|
||||
class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = ReformerTokenizer
|
||||
rust_tokenizer_class = ReformerTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
@@ -37,6 +39,28 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||
tokenizer.save_pretrained(self.tmpdirname)
|
||||
|
||||
def test_rust_and_python_full_tokenizers(self):
|
||||
if not self.test_rust_tokenizer:
|
||||
return
|
||||
|
||||
tokenizer = self.get_tokenizer()
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
|
||||
sequence = "I was born in 92000, and this is falsé."
|
||||
|
||||
tokens = tokenizer.tokenize(sequence)
|
||||
rust_tokens = rust_tokenizer.tokenize(sequence)
|
||||
self.assertListEqual(tokens, rust_tokens)
|
||||
|
||||
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
ids = tokenizer.encode(sequence)
|
||||
rust_ids = rust_tokenizer.encode(sequence)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||
|
||||
|
||||
@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer_class = RobertaTokenizer
|
||||
rust_tokenizer_class = RobertaTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -20,13 +20,12 @@ import unittest
|
||||
from transformers import BatchEncoding
|
||||
from transformers.file_utils import cached_property
|
||||
from transformers.testing_utils import _torch_available
|
||||
from transformers.tokenization_t5 import T5Tokenizer
|
||||
from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast
|
||||
from transformers.tokenization_xlnet import SPIECE_UNDERLINE
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
|
||||
SPIECE_UNDERLINE = "▁"
|
||||
|
||||
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
|
||||
|
||||
FRAMEWORK = "pt" if _torch_available else "tf"
|
||||
@@ -35,6 +34,8 @@ FRAMEWORK = "pt" if _torch_available else "tf"
|
||||
class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = T5Tokenizer
|
||||
rust_tokenizer_class = T5TokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
@@ -113,6 +114,38 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def t5_base_tokenizer(self):
|
||||
return T5Tokenizer.from_pretrained("t5-base")
|
||||
|
||||
@cached_property
|
||||
def t5_base_tokenizer_fast(self):
|
||||
return T5TokenizerFast.from_pretrained("t5-base")
|
||||
|
||||
def get_tokenizer(self, **kwargs) -> T5Tokenizer:
|
||||
return self.tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
|
||||
|
||||
def get_rust_tokenizer(self, **kwargs) -> T5TokenizerFast:
|
||||
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
|
||||
|
||||
def test_rust_and_python_full_tokenizers(self):
|
||||
if not self.test_rust_tokenizer:
|
||||
return
|
||||
|
||||
tokenizer = self.get_tokenizer()
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
|
||||
sequence = "I was born in 92000, and this is falsé."
|
||||
|
||||
tokens = tokenizer.tokenize(sequence)
|
||||
rust_tokens = rust_tokenizer.tokenize(sequence)
|
||||
self.assertListEqual(tokens, rust_tokens)
|
||||
|
||||
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
ids = tokenizer.encode(sequence)
|
||||
rust_ids = rust_tokenizer.encode(sequence)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
def test_eos_treatment(self):
|
||||
tokenizer = self.t5_base_tokenizer
|
||||
batch_with_eos_added = tokenizer(["hi</s>", "I went to the gym</s>", "</s>"])
|
||||
|
||||
@@ -17,20 +17,15 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.testing_utils import require_torch
|
||||
from transformers.tokenization_transfo_xl import VOCAB_FILES_NAMES, TransfoXLTokenizer
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
from transformers.tokenization_transfo_xl import VOCAB_FILES_NAMES, TransfoXLTokenizer
|
||||
|
||||
|
||||
@require_torch
|
||||
class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
|
||||
tokenizer_class = TransfoXLTokenizer
|
||||
test_rust_tokenizer = False
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -27,6 +27,7 @@ from .test_tokenization_common import TokenizerTesterMixin
|
||||
class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = XLMTokenizer
|
||||
test_rust_tokenizer = False
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
@@ -19,7 +19,7 @@ import unittest
|
||||
|
||||
from transformers.file_utils import cached_property
|
||||
from transformers.testing_utils import slow
|
||||
from transformers.tokenization_xlm_roberta import SPIECE_UNDERLINE, XLMRobertaTokenizer
|
||||
from transformers.tokenization_xlm_roberta import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
@@ -30,6 +30,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
|
||||
class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = XLMRobertaTokenizer
|
||||
rust_tokenizer_class = XLMRobertaTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
@@ -118,6 +120,28 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def big_tokenizer(self):
|
||||
return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
|
||||
|
||||
def test_rust_and_python_full_tokenizers(self):
|
||||
if not self.test_rust_tokenizer:
|
||||
return
|
||||
|
||||
tokenizer = self.get_tokenizer()
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
|
||||
sequence = "I was born in 92000, and this is falsé."
|
||||
|
||||
tokens = tokenizer.tokenize(sequence)
|
||||
rust_tokens = rust_tokenizer.tokenize(sequence)
|
||||
self.assertListEqual(tokens, rust_tokens)
|
||||
|
||||
ids = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
rust_tokenizer = self.get_rust_tokenizer()
|
||||
ids = tokenizer.encode(sequence)
|
||||
rust_ids = rust_tokenizer.encode(sequence)
|
||||
self.assertListEqual(ids, rust_ids)
|
||||
|
||||
@slow
|
||||
def test_tokenization_base_easy_symbols(self):
|
||||
symbols = "Hello World!"
|
||||
|
||||
@@ -18,7 +18,7 @@ import os
|
||||
import unittest
|
||||
|
||||
from transformers.testing_utils import slow
|
||||
from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
|
||||
from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
|
||||
@@ -29,12 +29,15 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
|
||||
class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
tokenizer_class = XLNetTokenizer
|
||||
rust_tokenizer_class = XLNetTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
# We have a SentencePiece fixture for testing
|
||||
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||
tokenizer.sanitize_special_tokens()
|
||||
tokenizer.save_pretrained(self.tmpdirname)
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
|
||||
Reference in New Issue
Block a user