[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)
* splitting fast and slow tokenizers [WIP] * [WIP] splitting sentencepiece and tokenizers dependencies * update dummy objects * add name_or_path to models and tokenizers * prefix added to file names * prefix * styling + quality * spliting all the tokenizer files - sorting sentencepiece based ones * update tokenizer version up to 0.9.0 * remove hard dependency on sentencepiece 🎉 * and removed hard dependency on tokenizers 🎉 * update conversion script * update missing models * fixing tests * move test_tokenization_fast to main tokenization tests - fix bugs * bump up tokenizers * fix bert_generation * update ad fix several tokenizers * keep sentencepiece in deps for now * fix funnel and deberta tests * fix fsmt * fix marian tests * fix layoutlm * fix squeezebert and gpt2 * fix T5 tokenization * fix xlnet tests * style * fix mbart * bump up tokenizers to 0.9.2 * fix model tests * fix tf models * fix seq2seq examples * fix tests without sentencepiece * fix slow => fast conversion without sentencepiece * update auto and bert generation tests * fix mbart tests * fix auto and common test without tokenizers * fix tests without tokenizers * clean up tests lighten up when tokenizers + sentencepiece are both off * style quality and tests fixing * add sentencepiece to doc/examples reqs * leave sentencepiece on for now * style quality split hebert and fix pegasus * WIP Herbert fast * add sample_text_no_unicode and fix hebert tokenization * skip FSMT example test for now * fix style * fix fsmt in example tests * update following Lysandre and Sylvain's comments * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/tokenization_utils_base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/tokenization_utils_base.py 
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -14,16 +14,18 @@
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import inspect
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from collections import OrderedDict
|
||||
from itertools import takewhile
|
||||
from typing import TYPE_CHECKING, Dict, List, Tuple, Union
|
||||
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
|
||||
from transformers.testing_utils import require_tf, require_torch, slow
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast, is_torch_available
|
||||
from transformers.testing_utils import get_tests_dir, require_tf, require_tokenizers, require_torch, slow
|
||||
from transformers.tokenization_utils import AddedToken
|
||||
|
||||
|
||||
@@ -31,6 +33,18 @@ if TYPE_CHECKING:
|
||||
from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel
|
||||
|
||||
|
||||
NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]


def filter_non_english(_, pretrained_name: str) -> bool:
    """Tell whether a checkpoint name carries none of the non-English language tags.

    Args:
        _: Ignored. NOTE(review): presumably the test instance when this function is stored as a
            class attribute and called through ``self`` -- confirm against the callers.
        pretrained_name: Name of the pretrained checkpoint to inspect.

    Returns:
        ``True`` when no entry of ``NON_ENGLISH_TAGS`` is a substring of ``pretrained_name``,
        ``False`` otherwise.
    """
    # A generator lets `any` stop at the first matching tag instead of building a full list first.
    return not any(lang in pretrained_name for lang in NON_ENGLISH_TAGS)
|
||||
|
||||
|
||||
def filter_roberta_detectors(_, pretrained_name: str):
    """Return ``True`` unless ``pretrained_name`` contains the substring ``"detector"``.

    The first positional argument is accepted and ignored.
    """
    has_detector_tag = "detector" in pretrained_name
    return not has_detector_tag
|
||||
|
||||
|
||||
def merge_model_tokenizer_mappings(
|
||||
model_mapping: Dict["PretrainedConfig", Union["PreTrainedModel", "TFPreTrainedModel"]],
|
||||
tokenizer_mapping: Dict["PretrainedConfig", Tuple["PreTrainedTokenizer", "PreTrainedTokenizerFast"]],
|
||||
@@ -59,8 +73,32 @@ class TokenizerTesterMixin:
|
||||
rust_tokenizer_class = None
|
||||
test_rust_tokenizer = False
|
||||
space_between_special_tokens = False
|
||||
from_pretrained_kwargs = None
|
||||
from_pretrained_filter = None
|
||||
from_pretrained_vocab_key = "vocab_file"
|
||||
|
||||
def setUp(self) -> None:
    """Prepare the fixtures shared by the tests of this mixin.

    Sets three attributes on the test instance:

    * ``tokenizers_list``: at most one ``(rust tokenizer class, checkpoint name, kwargs)`` triple,
      or an empty list when ``test_rust_tokenizer`` is false.
    * ``_data``: the content of ``fixtures/sample_text.txt`` with doubled newlines collapsed and
      surrounding whitespace stripped.
    * ``tmpdirname``: a freshly created temporary directory.

    Everything lives in a single ``setUp``: a second ``def setUp`` in the same class would replace
    this one and leave ``tokenizers_list`` and ``_data`` unset.
    """
    # `from_pretrained_filter` makes it possible to choose which checkpoint is tested, based on
    # the checkpoint name; `None` means "accept every checkpoint".
    self.tokenizers_list = []
    if self.test_rust_tokenizer:
        kwargs = self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}
        vocab_files = self.rust_tokenizer_class.pretrained_vocab_files_map[self.from_pretrained_vocab_key]
        for pretrained_name in vocab_files.keys():
            if self.from_pretrained_filter is None or self.from_pretrained_filter(pretrained_name):
                # Let's just test the first accepted pretrained vocab for speed
                self.tokenizers_list.append((self.rust_tokenizer_class, pretrained_name, kwargs))
                break

    with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
        self._data = f_data.read().replace("\n\n", "\n").strip()

    self.tmpdirname = tempfile.mkdtemp()
|
||||
|
||||
def tearDown(self):
|
||||
@@ -123,6 +161,15 @@ class TokenizerTesterMixin:
|
||||
for i in range(len(batch_encode_plus_sequences["input_ids"]))
|
||||
]
|
||||
|
||||
def test_rust_tokenizer_signature(self):
    """Check that the Rust tokenizer's ``__init__`` takes a ``tokenizer_file`` parameter defaulting to ``None``.

    Does nothing when ``test_rust_tokenizer`` is false.
    """
    if self.test_rust_tokenizer:
        init_parameters = inspect.signature(self.rust_tokenizer_class.__init__).parameters

        self.assertIn("tokenizer_file", init_parameters)
        self.assertIsNone(init_parameters["tokenizer_file"].default)
|
||||
|
||||
def test_rust_and_python_full_tokenizers(self):
|
||||
if not self.test_rust_tokenizer:
|
||||
return
|
||||
@@ -206,7 +253,6 @@ class TokenizerTesterMixin:
|
||||
|
||||
shutil.rmtree(tmpdirname)
|
||||
|
||||
# Now let's start the test
|
||||
tokenizers = self.get_tokenizers(model_max_length=42)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
@@ -237,6 +283,39 @@ class TokenizerTesterMixin:
|
||||
|
||||
shutil.rmtree(tmpdirname)
|
||||
|
||||
# Test that we can also use the non-legacy saving format for fast tokenizers
|
||||
tokenizers = self.get_tokenizers(model_max_length=42)
|
||||
for tokenizer in tokenizers:
|
||||
if not tokenizer.is_fast:
|
||||
continue
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
# Isolate this from the other tests because we save additional tokens/etc
|
||||
tmpdirname = tempfile.mkdtemp()
|
||||
|
||||
sample_text = " He is very happy, UNwant\u00E9d,running"
|
||||
tokenizer.add_tokens(["bim", "bambam"])
|
||||
additional_special_tokens = tokenizer.additional_special_tokens
|
||||
additional_special_tokens.append("new_additional_special_token")
|
||||
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
|
||||
before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
|
||||
before_vocab = tokenizer.get_vocab()
|
||||
tokenizer.save_pretrained(tmpdirname)
|
||||
|
||||
after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
|
||||
after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
|
||||
after_vocab = after_tokenizer.get_vocab()
|
||||
self.assertListEqual(before_tokens, after_tokens)
|
||||
self.assertDictEqual(before_vocab, after_vocab)
|
||||
self.assertIn("bim", after_vocab)
|
||||
self.assertIn("bambam", after_vocab)
|
||||
self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens)
|
||||
self.assertEqual(after_tokenizer.model_max_length, 42)
|
||||
|
||||
tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43)
|
||||
self.assertEqual(tokenizer.model_max_length, 43)
|
||||
|
||||
shutil.rmtree(tmpdirname)
|
||||
|
||||
def test_pickle_tokenizer(self):
|
||||
"""Google pickle __getstate__ __setstate__ if you are struggling with this."""
|
||||
tokenizers = self.get_tokenizers()
|
||||
@@ -258,6 +337,7 @@ class TokenizerTesterMixin:
|
||||
|
||||
self.assertListEqual(subwords, subwords_loaded)
|
||||
|
||||
@require_tokenizers
|
||||
def test_pickle_added_tokens(self):
|
||||
tok1 = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)
|
||||
tok2 = pickle.loads(pickle.dumps(tok1))
|
||||
@@ -419,6 +499,7 @@ class TokenizerTesterMixin:
|
||||
|
||||
self.assertEqual(text_2, output_text)
|
||||
|
||||
@require_tokenizers
|
||||
def test_encode_decode_with_spaces(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
@@ -437,6 +518,15 @@ class TokenizerTesterMixin:
|
||||
self.assertIn(decoded, [output, output.lower()])
|
||||
|
||||
def test_pretrained_model_lists(self):
|
||||
# We should have at least one default checkpoint for each tokenizer
|
||||
# We should specify the max input length as well (used in some part to list the pretrained checkpoints)
|
||||
self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
|
||||
self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
|
||||
self.assertEqual(
|
||||
len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]),
|
||||
len(self.tokenizer_class.max_model_input_sizes),
|
||||
)
|
||||
|
||||
weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
|
||||
weights_lists_2 = []
|
||||
for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
|
||||
@@ -1226,6 +1316,7 @@ class TokenizerTesterMixin:
|
||||
encoded_sequences_batch_padded_2[key],
|
||||
)
|
||||
|
||||
@require_tokenizers
|
||||
def test_added_token_serializable(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
@@ -1652,3 +1743,772 @@ class TokenizerTesterMixin:
|
||||
self.assertEqual(batch_encoder_only.input_ids.shape[1], 3)
|
||||
self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3)
|
||||
self.assertNotIn("decoder_input_ids", batch_encoder_only)
|
||||
|
||||
def test_is_fast(self):
    """The Python tokenizer must report ``is_fast`` as false and the Rust tokenizer as true."""
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            slow_tok = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            # Each implementation has to advertise its own kind
            self.assertFalse(slow_tok.is_fast)
            self.assertTrue(fast_tok.is_fast)
|
||||
|
||||
def test_fast_only_inputs(self):
    """Every encoding entry point of the Rust tokenizer must raise ``TypeError`` when given ``None``."""
    entry_points = ("tokenize", "encode", "encode_plus", "batch_encode_plus")

    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            # `None` is never a valid input
            for method_name in entry_points:
                self.assertRaises(TypeError, getattr(fast_tok, method_name), None)
|
||||
|
||||
def test_alignement_methods(self):
    """Check the word/token/character alignment helpers of the Rust tokenizer's encodings.

    A six-word sentence is encoded once on its own and once as a batch of three copies, without
    special tokens. Each helper is then probed at the first and last position, both with and
    without an explicit batch index.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
            text = " ".join(words)
            batch_size = 3

            single = fast_tok.encode_plus(text, add_special_tokens=False)
            batch = fast_tok.batch_encode_plus([text] * batch_size, add_special_tokens=False)

            num_tokens = len(single["input_ids"])
            text_len = len(text)

            # Index of the last element along each axis
            final_word = len(words) - 1
            final_token = num_tokens - 1
            final_sample = batch_size - 1
            final_char = text_len - 1

            # words() yields one word index per token; tokens() yields one string per token
            self.assertEqual(len(single.words(0)), num_tokens)
            self.assertEqual(max(single.words(0)), final_word)
            self.assertEqual(min(single.words(0)), 0)
            self.assertEqual(len(batch.words(final_sample)), num_tokens)
            self.assertEqual(max(batch.words(final_sample)), final_word)
            self.assertEqual(min(batch.words(final_sample)), 0)
            self.assertEqual(len(single.tokens(0)), num_tokens)

            # token -> word
            self.assertEqual(single.token_to_word(0), 0)
            self.assertEqual(single.token_to_word(0, 0), 0)
            self.assertEqual(single.token_to_word(final_token), final_word)
            self.assertEqual(single.token_to_word(0, final_token), final_word)
            self.assertEqual(batch.token_to_word(1, 0), 0)
            self.assertEqual(batch.token_to_word(0, final_token), final_word)
            self.assertEqual(batch.token_to_word(final_sample, final_token), final_word)

            # word -> token span (the span end is exclusive, hence `num_tokens`)
            self.assertEqual(single.word_to_tokens(0).start, 0)
            self.assertEqual(single.word_to_tokens(0, 0).start, 0)
            self.assertEqual(single.word_to_tokens(final_word).end, num_tokens)
            self.assertEqual(single.word_to_tokens(0, final_word).end, num_tokens)
            self.assertEqual(batch.word_to_tokens(1, 0).start, 0)
            self.assertEqual(batch.word_to_tokens(0, final_word).end, num_tokens)
            self.assertEqual(batch.word_to_tokens(final_sample, final_word).end, num_tokens)

            # token -> character span (the span end is exclusive, hence `text_len`)
            self.assertEqual(single.token_to_chars(0).start, 0)
            self.assertEqual(single.token_to_chars(0, 0).start, 0)
            self.assertEqual(single.token_to_chars(final_token).end, text_len)
            self.assertEqual(single.token_to_chars(0, final_token).end, text_len)
            self.assertEqual(batch.token_to_chars(1, 0).start, 0)
            self.assertEqual(batch.token_to_chars(0, final_token).end, text_len)
            self.assertEqual(batch.token_to_chars(final_sample, final_token).end, text_len)

            # character -> token
            self.assertEqual(single.char_to_token(0), 0)
            self.assertEqual(single.char_to_token(0, 0), 0)
            self.assertEqual(single.char_to_token(final_char), final_token)
            self.assertEqual(single.char_to_token(0, final_char), final_token)
            self.assertEqual(batch.char_to_token(1, 0), 0)
            self.assertEqual(batch.char_to_token(0, final_char), final_token)
            self.assertEqual(batch.char_to_token(final_sample, final_char), final_token)

            # character -> word
            self.assertEqual(single.char_to_word(0), 0)
            self.assertEqual(single.char_to_word(0, 0), 0)
            self.assertEqual(single.char_to_word(final_char), final_word)
            self.assertEqual(single.char_to_word(0, final_char), final_word)
            self.assertEqual(batch.char_to_word(1, 0), 0)
            self.assertEqual(batch.char_to_word(0, final_char), final_word)
            self.assertEqual(batch.char_to_word(final_sample, final_char), final_word)

            # word -> character span (the span end is exclusive, hence `text_len`)
            self.assertEqual(single.word_to_chars(0).start, 0)
            self.assertEqual(single.word_to_chars(0, 0).start, 0)
            self.assertEqual(single.word_to_chars(final_word).end, text_len)
            self.assertEqual(single.word_to_chars(0, final_word).end, text_len)
            self.assertEqual(batch.word_to_chars(1, 0).start, 0)
            self.assertEqual(batch.word_to_chars(0, final_word).end, text_len)
            self.assertEqual(batch.word_to_chars(final_sample, final_word).end, text_len)
|
||||
|
||||
def test_tokenization_python_rust_equals(self):
    """Check that the Python and Rust tokenizers encode the sample text identically.

    Four scenarios are compared: a single sequence, a sequence pair, truncation to 512 tokens,
    and truncation with a stride and overflowing tokens. Only ``input_ids``, ``token_type_ids``
    and ``attention_mask`` are compared, and only when the Python tokenizer returns them.
    """
    compared_keys = ("input_ids", "token_type_ids", "attention_mask")

    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
            tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            # Ensure basic input match
            input_p = tokenizer_p.encode_plus(self._data)
            input_r = tokenizer_r.encode_plus(self._data)

            for key in [k for k in input_p.keys() if k in compared_keys]:
                self.assertSequenceEqual(input_p[key], input_r[key])

            # Ensure sequence pairs match
            input_pairs_p = tokenizer_p.encode_plus(self._data, self._data)
            input_pairs_r = tokenizer_r.encode_plus(self._data, self._data)

            # Take the keys from the pair encoding itself rather than from the single-sequence one
            for key in [k for k in input_pairs_p.keys() if k in compared_keys]:
                self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])

            # Ensure truncation match
            input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True)
            input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True)

            for key in [k for k in input_p.keys() if k in compared_keys]:
                self.assertSequenceEqual(input_p[key], input_r[key])

            # Ensure truncation with stride match
            input_p = tokenizer_p.encode_plus(
                self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
            )
            input_r = tokenizer_r.encode_plus(
                self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
            )

            for key in [k for k in input_p.keys() if k in compared_keys]:
                # NOTE(review): the Rust output is indexed with [0] here -- presumably it nests one
                # entry per overflowing chunk while the Python output stays flat; confirm.
                self.assertSequenceEqual(input_p[key], input_r[key][0])
|
||||
|
||||
def test_num_special_tokens_to_add_equal(self):
    """Both tokenizers must add the same number of special tokens, for single and pair inputs."""
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            slow_tok = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            # False -> single sequence, True -> pair of sequences
            for is_pair in (False, True):
                self.assertEqual(
                    fast_tok.num_special_tokens_to_add(is_pair), slow_tok.num_special_tokens_to_add(is_pair)
                )
|
||||
|
||||
def test_max_length_equal(self):
    """Both tokenizers must expose the same maximum lengths for single sentences and sentence pairs."""
    length_attributes = ("max_len_single_sentence", "max_len_sentences_pair")

    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            slow_tok = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            for attribute in length_attributes:
                self.assertEqual(getattr(fast_tok, attribute), getattr(slow_tok, attribute))
|
||||
|
||||
def test_special_tokens_map_equal(self):
    """Check that the Python and Rust tokenizers expose the same ``special_tokens_map``."""
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
            tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            # Compare the mappings directly. `.items()` views are not indexable, so
            # `assertSequenceEqual` could only report "Unable to index element" on a mismatch,
            # whereas `assertEqual` produces a proper diff when both sides are dicts.
            self.assertEqual(tokenizer_p.special_tokens_map, tokenizer_r.special_tokens_map)
|
||||
|
||||
def test_add_tokens(self):
    """Check the counts returned by ``add_tokens`` / ``add_special_tokens`` and the resulting length.

    Three regular tokens and five special tokens are added in total, so the tokenizer must end
    up eight entries longer than it started.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            initial_size = len(fast_tok)

            # Regular tokens: the empty string is not counted as an addition
            regular_cases = (("", 0), ("testoken", 1), (["testoken1", "testtoken2"], 2))
            for new_tokens, expected_count in regular_cases:
                self.assertEqual(fast_tok.add_tokens(new_tokens), expected_count)
            self.assertEqual(len(fast_tok), initial_size + 3)

            # Special tokens
            self.assertEqual(fast_tok.add_special_tokens({}), 0)
            self.assertEqual(fast_tok.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2)

            # `additional_special_tokens` given as a bare string must be rejected
            with self.assertRaises(AssertionError):
                fast_tok.add_special_tokens({"additional_special_tokens": "<testtoken1>"})

            extra_cases = ((["<testtoken2>"], 1), (["<testtoken3>", "<testtoken4>"], 2))
            for extra_tokens, expected_count in extra_cases:
                self.assertEqual(
                    fast_tok.add_special_tokens({"additional_special_tokens": extra_tokens}), expected_count
                )
            self.assertEqual(len(fast_tok), initial_size + 8)
|
||||
|
||||
def test_offsets_mapping(self):
    """Check the offset mapping and special-tokens mask returned by the Rust tokenizer.

    For a single sequence and for a pair, there must be one offset per input id, and the
    special-tokens mask must sum to the number of special tokens the tokenizer reports adding.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            text = "Wonderful no inspiration example with subtoken"
            pair = "Along with an awesome pair"

            # First without a pair, then with one
            for sequences, is_pair in (((text,), False), ((text, pair), True)):
                encoded = fast_tok.encode_plus(
                    *sequences, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
                )
                expected_special = fast_tok.num_special_tokens_to_add(is_pair)
                offsets = encoded["offset_mapping"]

                # Assert there is the same number of tokens and offsets
                self.assertEqual(len(offsets), len(encoded["input_ids"]))

                # Assert the mask flags only the added special tokens
                self.assertEqual(sum(encoded["special_tokens_mask"]), expected_special)
|
||||
|
||||
def test_batch_encode_dynamic_overflowing(self):
    """
    When calling batch_encode with multiple sequences, it can return a different number of
    overflowing encodings for each sequence:
    [
      Sequence 1: [Encoding 1, Encoding 2],
      Sequence 2: [Encoding 1],
      Sequence 3: [Encoding 1, Encoding 2, ... Encoding N]
    ]
    This needs to be padded so that it can be represented as a tensor.

    The test checks that every returned entry except ``overflow_to_sample_mapping`` is a
    2-dimensional tensor, and that batched outputs have a last dimension of ``max_length`` (6).
    Tokenizers without a usable padding token are skipped.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

        with self.subTest(
            "{} ({}, {})".format(tokenizer_r.__class__.__name__, pretrained_name, tokenizer_r.__class__.__name__)
        ):
            returned_tensor = "pt" if is_torch_available() else "tf"

            # Padding is impossible without a pad token: skip this tokenizer only, rather than
            # returning and silently dropping the remaining tokenizers.
            if not tokenizer_r.pad_token or tokenizer_r.pad_token_id < 0:
                continue

            tokens = tokenizer_r.encode_plus(
                "HuggingFace is solving NLP one commit at a time",
                max_length=6,
                padding=True,
                truncation=True,
                return_tensors=returned_tensor,
                return_overflowing_tokens=True,
            )

            for key in [k for k in tokens.keys() if "overflow_to_sample_mapping" not in k]:
                self.assertEqual(len(tokens[key].shape), 2)

            # Mono sample first, then multi sample
            batches = (
                ["HuggingFace is solving NLP one commit at a time"],
                ["HuggingFace is solving NLP one commit at a time", "Very tiny input"],
            )
            for batch in batches:
                tokens = tokenizer_r.batch_encode_plus(
                    batch,
                    max_length=6,
                    padding=True,
                    truncation="only_first",
                    return_tensors=returned_tensor,
                    return_overflowing_tokens=True,
                )

                for key in [k for k in tokens.keys() if "overflow_to_sample_mapping" not in k]:
                    self.assertEqual(len(tokens[key].shape), 2)
                    self.assertEqual(tokens[key].shape[-1], 6)
|
||||
|
||||
def test_compare_pretokenized_inputs(self):
    """Check that the Python and Rust tokenizers agree on pre-tokenized (word-split) inputs.

    ``encode``, ``encode_plus`` and ``batch_encode_plus`` are compared for single sequences and
    for sequence pairs. Tokenizers that have an ``add_prefix_space`` attribute set to a false
    value are skipped.
    """
    # Options shared by every `encode_plus` / `batch_encode_plus` call below. The token-type-id
    # and attention-mask flags are deliberately left at each tokenizer's defaults.
    encode_kwargs = {
        "is_split_into_words": True,
        "return_overflowing_tokens": False,
        "return_special_tokens_mask": True,
        "return_offsets_mapping": False,  # Not implemented in python tokenizers
    }

    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
            tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            if hasattr(tokenizer_p, "add_prefix_space") and not tokenizer_p.add_prefix_space:
                continue  # Too hard to test for now

            # Input string
            pretokenized_input_simple = "This is a sample input".split()
            pretokenized_input_pair = "This is a sample pair".split()

            # Test encode for pretokenized inputs
            output_r = tokenizer_r.encode(
                pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
            )
            output_p = tokenizer_p.encode(
                pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
            )
            self.assertEqual(output_p, output_r)

            # Test encode_plus for pretokenized inputs
            output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **encode_kwargs)
            output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **encode_kwargs)
            for key in output_p.keys():
                self.assertEqual(output_p[key], output_r[key])

            # Test batch_encode_plus for pretokenized inputs
            input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair]
            output_r = tokenizer_r.batch_encode_plus(input_batch, **encode_kwargs)
            output_p = tokenizer_p.batch_encode_plus(input_batch, **encode_kwargs)
            for key in output_p.keys():
                self.assertEqual(output_p[key], output_r[key])

            # Test encode for pretokenized input pairs
            output_r = tokenizer_r.encode(
                pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
            )
            output_p = tokenizer_p.encode(
                pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
            )
            self.assertEqual(output_p, output_r)

            # Test encode_plus for pretokenized input pairs
            output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **encode_kwargs)
            output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **encode_kwargs)
            for key in output_p.keys():
                self.assertEqual(output_p[key], output_r[key])

            # Test batch_encode_plus on a second, longer batch of pretokenized inputs
            input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [
                pretokenized_input_simple + pretokenized_input_pair,
                pretokenized_input_pair,
            ]
            output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **encode_kwargs)
            output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **encode_kwargs)
            for key in output_p.keys():
                self.assertEqual(output_p[key], output_r[key])
|
||||
|
||||
def test_create_token_type_ids(self):
    """Both tokenizers must build the same token type ids, for one sequence and for a pair."""
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            slow_tok = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            input_simple = [1, 2, 3]
            input_pair = [1, 2, 3]

            # Single sequence first, then the pair
            for sequences in ((input_simple,), (input_simple, input_pair)):
                from_fast = fast_tok.create_token_type_ids_from_sequences(*sequences)
                from_slow = slow_tok.create_token_type_ids_from_sequences(*sequences)
                self.assertEqual(from_slow, from_fast)
|
||||
|
||||
def test_build_inputs_with_special_tokens(self):
    """Both tokenizers must wrap the same token ids with the same special tokens.

    The ids come from the Python tokenizer's ``encode`` (without special tokens) and are fed to
    ``build_inputs_with_special_tokens`` on both tokenizers, as one sequence and as a pair.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            slow_tok = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            # Input token ids
            input_simple = slow_tok.encode("This is a sample input", add_special_tokens=False)
            input_pair = slow_tok.encode("This is a sample pair", add_special_tokens=False)

            # Single sequence first, then the pair
            for sequences in ((input_simple,), (input_simple, input_pair)):
                from_fast = fast_tok.build_inputs_with_special_tokens(*sequences)
                from_slow = slow_tok.build_inputs_with_special_tokens(*sequences)
                self.assertEqual(from_slow, from_fast)
|
||||
|
||||
def test_padding(self, max_length=50):
    """Verify that the Rust and Python tokenizers pad their outputs identically.

    For every ``(tokenizer, pretrained_name, kwargs)`` entry of ``self.tokenizers_list``
    both ``self.rust_tokenizer_class`` and ``self.tokenizer_class`` are loaded and
    compared through ``encode``, ``encode_plus``, ``batch_encode_plus`` and ``pad``,
    on single and pair inputs.

    Args:
        max_length: Length requested whenever padding to a fixed size is exercised.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
            tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            def assert_padded_input_match(input_r: list, input_p: list, max_length: int):
                # Ensure we match max_length
                self.assertEqual(len(input_r), max_length)
                self.assertEqual(len(input_p), max_length)

                # Ensure the number of padded tokens is the same
                padded_tokens_r = list(takewhile(lambda i: i == tokenizer_r.pad_token_id, reversed(input_r)))
                padded_tokens_p = list(takewhile(lambda i: i == tokenizer_p.pad_token_id, reversed(input_p)))
                self.assertSequenceEqual(padded_tokens_r, padded_tokens_p)

            def assert_batch_padded_input_match(input_r: dict, input_p: dict, max_length: int):
                # Every field of BOTH outputs must hold two rows of max_length items
                # (the Python output used to be skipped: the Rust one was checked twice)
                for batch_output in (input_r, input_p):
                    for field_rows in batch_output.values():
                        self.assertEqual(len(field_rows), 2)
                        self.assertEqual(len(field_rows[0]), max_length)
                        self.assertEqual(len(field_rows[1]), max_length)

                for i_r, i_p in zip(input_r["input_ids"], input_p["input_ids"]):
                    assert_padded_input_match(i_r, i_p, max_length)

                for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]):
                    self.assertSequenceEqual(i_r, i_p)

            # Encode - Simple input
            # (`pad_to_max_length=True` and `padding="max_length"` are both exercised)
            input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
            input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
            assert_padded_input_match(input_r, input_p, max_length)
            input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
            input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
            assert_padded_input_match(input_r, input_p, max_length)

            input_r = tokenizer_r.encode("This is a simple input", padding="longest")
            input_p = tokenizer_p.encode("This is a simple input", padding=True)
            assert_padded_input_match(input_r, input_p, len(input_r))

            # Encode - Pair input
            input_r = tokenizer_r.encode(
                "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
            )
            input_p = tokenizer_p.encode(
                "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
            )
            assert_padded_input_match(input_r, input_p, max_length)
            input_r = tokenizer_r.encode(
                "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
            )
            input_p = tokenizer_p.encode(
                "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
            )
            assert_padded_input_match(input_r, input_p, max_length)
            input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True)
            input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest")
            assert_padded_input_match(input_r, input_p, len(input_r))

            # Encode_plus - Simple input
            input_r = tokenizer_r.encode_plus(
                "This is a simple input", max_length=max_length, pad_to_max_length=True
            )
            input_p = tokenizer_p.encode_plus(
                "This is a simple input", max_length=max_length, pad_to_max_length=True
            )
            assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
            self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
            input_r = tokenizer_r.encode_plus(
                "This is a simple input", max_length=max_length, padding="max_length"
            )
            input_p = tokenizer_p.encode_plus(
                "This is a simple input", max_length=max_length, padding="max_length"
            )
            assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
            self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

            input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest")
            input_p = tokenizer_p.encode_plus("This is a simple input", padding=True)
            assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))

            self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

            # Encode_plus - Pair input
            input_r = tokenizer_r.encode_plus(
                "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
            )
            input_p = tokenizer_p.encode_plus(
                "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
            )
            assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
            self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
            input_r = tokenizer_r.encode_plus(
                "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
            )
            input_p = tokenizer_p.encode_plus(
                "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
            )
            assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
            self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
            input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest")
            input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True)
            assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
            self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

            # Batch_encode_plus - Simple input
            input_r = tokenizer_r.batch_encode_plus(
                ["This is a simple input 1", "This is a simple input 2"],
                max_length=max_length,
                pad_to_max_length=True,
            )
            input_p = tokenizer_p.batch_encode_plus(
                ["This is a simple input 1", "This is a simple input 2"],
                max_length=max_length,
                pad_to_max_length=True,
            )
            assert_batch_padded_input_match(input_r, input_p, max_length)

            input_r = tokenizer_r.batch_encode_plus(
                ["This is a simple input 1", "This is a simple input 2"],
                max_length=max_length,
                padding="max_length",
            )
            input_p = tokenizer_p.batch_encode_plus(
                ["This is a simple input 1", "This is a simple input 2"],
                max_length=max_length,
                padding="max_length",
            )
            assert_batch_padded_input_match(input_r, input_p, max_length)

            input_r = tokenizer_r.batch_encode_plus(
                ["This is a simple input 1", "This is a simple input 2"],
                max_length=max_length,
                padding="longest",
            )
            input_p = tokenizer_p.batch_encode_plus(
                ["This is a simple input 1", "This is a simple input 2"],
                max_length=max_length,
                padding=True,
            )
            assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

            input_r = tokenizer_r.batch_encode_plus(
                ["This is a simple input 1", "This is a simple input 2"], padding="longest"
            )
            input_p = tokenizer_p.batch_encode_plus(
                ["This is a simple input 1", "This is a simple input 2"], padding=True
            )
            assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

            # Batch_encode_plus - Pair input
            input_r = tokenizer_r.batch_encode_plus(
                [
                    ("This is a simple input 1", "This is a simple input 2"),
                    ("This is a simple pair 1", "This is a simple pair 2"),
                ],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            input_p = tokenizer_p.batch_encode_plus(
                [
                    ("This is a simple input 1", "This is a simple input 2"),
                    ("This is a simple pair 1", "This is a simple pair 2"),
                ],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            assert_batch_padded_input_match(input_r, input_p, max_length)

            input_r = tokenizer_r.batch_encode_plus(
                [
                    ("This is a simple input 1", "This is a simple input 2"),
                    ("This is a simple pair 1", "This is a simple pair 2"),
                ],
                padding=True,
            )
            input_p = tokenizer_p.batch_encode_plus(
                [
                    ("This is a simple input 1", "This is a simple input 2"),
                    ("This is a simple pair 1", "This is a simple pair 2"),
                ],
                padding="longest",
            )
            assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

            # Using pad on single examples after tokenization.
            # The Python side must go through tokenizer_p here and below; building it
            # with tokenizer_r would only compare the Rust output with itself.
            input_r = tokenizer_r.encode_plus("This is a input 1")
            input_r = tokenizer_r.pad(input_r)

            input_p = tokenizer_p.encode_plus("This is a input 1")
            input_p = tokenizer_p.pad(input_p)

            assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))

            # Using pad on single examples after tokenization, up to max_length
            input_r = tokenizer_r.encode_plus("This is a input 1")
            input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

            input_p = tokenizer_p.encode_plus("This is a input 1")
            input_p = tokenizer_p.pad(input_p, max_length=max_length, padding="max_length")

            assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)

            # Using pad on a batch after tokenization
            input_r = tokenizer_r.batch_encode_plus(
                ["This is a input 1", "This is a much longer input whilch should be padded"]
            )
            input_r = tokenizer_r.pad(input_r)

            input_p = tokenizer_p.batch_encode_plus(
                ["This is a input 1", "This is a much longer input whilch should be padded"]
            )
            input_p = tokenizer_p.pad(input_p)

            assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

            # Using pad on a batch after tokenization, up to max_length
            input_r = tokenizer_r.batch_encode_plus(
                ["This is a input 1", "This is a much longer input whilch should be padded"]
            )
            input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

            input_p = tokenizer_p.batch_encode_plus(
                ["This is a input 1", "This is a much longer input whilch should be padded"]
            )
            input_p = tokenizer_p.pad(input_p, max_length=max_length, padding="max_length")

            assert_batch_padded_input_match(input_r, input_p, max_length)
|
||||
|
||||
def test_save_pretrained(self):
    """Check that the Rust and Python tokenizers save and reload consistently.

    For every entry of ``self.tokenizers_list`` both tokenizers are saved into the
    same temporary directory; the returned file sequences must be equal, and after
    reloading, every key of the Python tokenizer's ``special_tokens_map`` must exist
    as an attribute on the reloaded Rust tokenizer.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
            tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            # The context manager removes the directory even when an assertion below
            # fails, so a failing sub-test no longer leaves files behind.
            with tempfile.TemporaryDirectory() as tmpdirname2:
                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
                # Checks both tokenizers save the same files
                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)

                # Checks everything loads correctly in the same way
                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)

                # Check special tokens are set accordingly on Rust and Python
                for key in tokenizer_pp.special_tokens_map:
                    self.assertTrue(hasattr(tokenizer_rp, key))
                    # NOTE: the value/id equality checks below are currently disabled
                    # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
                    # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
|
||||
|
||||
def test_embeded_special_tokens(self):
    """Compare both tokenizers on a sentence that contains ``<mask>`` in its text.

    The ``encode_plus`` outputs must be equal field by field, the ``token_type_ids``
    sums must agree when that field is present, and converting the ids back to
    tokens must yield the same sequence.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            slow_tok = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            sentence = "A, <mask> AllenNLP sentence."
            encoded_fast = fast_tok.encode_plus(sentence, add_special_tokens=True)
            encoded_slow = slow_tok.encode_plus(sentence, add_special_tokens=True)

            # Every field produced by the Python tokenizer must match the Rust one
            for field in encoded_slow.keys():
                self.assertEqual(encoded_fast[field], encoded_slow[field])

            if "token_type_ids" in encoded_fast:
                type_sum_fast = sum(encoded_fast["token_type_ids"])
                type_sum_slow = sum(encoded_slow["token_type_ids"])
                self.assertEqual(type_sum_fast, type_sum_slow)

            # Round-trip the ids back to token strings and compare those too
            decoded_fast = fast_tok.convert_ids_to_tokens(encoded_fast["input_ids"])
            decoded_slow = slow_tok.convert_ids_to_tokens(encoded_slow["input_ids"])
            self.assertSequenceEqual(decoded_fast, decoded_slow)
|
||||
|
||||
def test_compare_add_special_tokens(self):
    """Check the effect of ``add_special_tokens`` on the Rust tokenizer's outputs.

    For the texts ``""`` and ``" "``, the output produced with special tokens must be
    exactly ``num_special_tokens_to_add(pair=False)`` items longer than the output
    produced without them, for ``tokenize``, ``encode``, ``encode_plus`` and
    ``batch_encode_plus``. Only the single-sequence count is checked.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

            extra = fast_tok.num_special_tokens_to_add(pair=False)

            for text in ["", " "]:
                # tokenize() then encode(): both are compared on their overall length
                for method_name in ("tokenize", "encode"):
                    method = getattr(fast_tok, method_name)
                    bare = method(text, add_special_tokens=False)
                    decorated = method(text, add_special_tokens=True)
                    self.assertEqual(len(bare), len(decorated) - extra)

                # encode_plus(): every field is compared
                bare = fast_tok.encode_plus(text, add_special_tokens=False)
                decorated = fast_tok.encode_plus(text, add_special_tokens=True)
                for field in bare.keys():
                    self.assertEqual(len(bare[field]), len(decorated[field]) - extra)

                # batch_encode_plus(): every row of every field is compared
                bare = fast_tok.batch_encode_plus([text, text], add_special_tokens=False)
                decorated = fast_tok.batch_encode_plus([text, text], add_special_tokens=True)
                for field in bare.keys():
                    for bare_row, decorated_row in zip(bare[field], decorated[field]):
                        self.assertEqual(len(bare_row), len(decorated_row) - extra)
|
||||
|
||||
def test_compare_prepare_for_model(self):
    """Compare ``prepare_for_model`` between the Python and Rust tokenizers.

    Each tokenizer encodes the same sentence without special tokens and passes the
    resulting ids to its own ``prepare_for_model``; every key of the Python output
    must map to an equal value in the Rust output.
    """
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
            fast_tok = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            slow_tok = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
            string_sequence = "Asserting that both tokenizers are equal"

            # Python side first, then Rust, each working from its own ids
            slow_ids = slow_tok.encode(string_sequence, add_special_tokens=False)
            slow_prepared = slow_tok.prepare_for_model(slow_ids)
            fast_ids = fast_tok.encode(string_sequence, add_special_tokens=False)
            fast_prepared = fast_tok.prepare_for_model(fast_ids)

            for field in slow_prepared:
                self.assertEqual(slow_prepared[field], fast_prepared[field])
|
||||
|
||||
Reference in New Issue
Block a user