Add regression tests for slow sentencepiece tokenizers. (#11737)
* add test_vocab_size for sentencepiece tokenizers
* add test_get_vocab for sentencepiece tokenizers
* add test_convert_token_and_id for sentencepiece tokenizers
* add test_tokenize_and_convert_tokens_to_string for all tokenizers
* improve test_tokenize_and_convert_tokens_to_string for sentencepiece tokenizers
* add common tokenizer integration tests
  - for albert
  - for barthez
* add tokenizer integration tests to bert generation
* add most tokenizer integration tests
* fix camembert tokenizer integration test
* add tokenizer integration test to marian
* add tokenizer integration test to reformer
* add typing and doc to tokenizer_integration_test_util
* fix tokenizer integration test of reformer
* improve test_sentencepiece_tokenize_and_convert_tokens_to_string
* empty commit to trigger CI
* fix tokenizer integration test of reformer
* remove code not needed anymore
* empty commit to trigger CI
* empty commit to trigger CI
This commit is contained in:
@@ -24,7 +24,7 @@ import tempfile
|
||||
import unittest
|
||||
from collections import OrderedDict
|
||||
from itertools import takewhile
|
||||
from typing import TYPE_CHECKING, Dict, List, Tuple, Union
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
|
||||
|
||||
from huggingface_hub import HfApi
|
||||
from requests.exceptions import HTTPError
|
||||
@@ -175,13 +175,74 @@ class TokenizerTesterMixin:
|
||||
def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
    """Build ``self.rust_tokenizer_class`` via ``from_pretrained`` on ``self.tmpdirname``.

    Any keyword arguments are forwarded unchanged to ``from_pretrained``.
    """
    load_from_pretrained = self.rust_tokenizer_class.from_pretrained
    return load_from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
# def get_input_output_texts(self) -> Tuple[str, str]:
|
||||
# """Feel free to overwrite"""
|
||||
# # TODO: @property
|
||||
# return (
|
||||
# "This is a test",
|
||||
# "This is a test",
|
||||
# )
|
||||
def tokenizer_integration_test_util(
    self,
    expected_encoding: Dict,
    model_name: str,
    revision: str = None,
    sequences: List[str] = None,
    decode_kwargs: Dict[str, Any] = None,
    padding: bool = True,
):
    """
    Util for integration test.

    Text is tokenized and then reverted back to text. Both results are then checked.

    Args:
        expected_encoding:
            The expected result of the tokenizer output.
        model_name:
            The model name of the tokenizer to load and use.
        revision:
            The full git revision number of the model. This is to pin the
            tokenizer config and to avoid that tests start to fail if the
            config gets changed upstream.
        sequences:
            Can overwrite the texts that are used to check the tokenizer.
            This is useful if the tokenizer supports non-English languages
            like French.
        decode_kwargs:
            Additional args for the ``decode`` function which reverts the
            tokenized text back to a string. ``skip_special_tokens`` defaults
            to ``True``; a value given here overrides that default.
        padding:
            Activates and controls padding of the tokenizer.
    """
    # Merge into a fresh dict: the caller's dict is never mutated, and a caller-supplied
    # ``skip_special_tokens`` replaces the default instead of colliding with it.
    merged_decode_kwargs = {"skip_special_tokens": True}
    if decode_kwargs is not None:
        merged_decode_kwargs.update(decode_kwargs)

    if sequences is None:
        sequences = [
            "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
            "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
            "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained "
            "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
            "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
            "conditioning on both left and right context in all layers.",
            "The quick brown fox jumps over the lazy dog.",
        ]

    # The reference texts do not depend on the tokenizer class, so compute them once.
    if self.test_sentencepiece_ignore_case:
        expected_texts = [text.lower() for text in sequences]
    else:
        expected_texts = list(sequences)

    tokenizer_classes = [self.tokenizer_class]
    if self.test_rust_tokenizer:
        tokenizer_classes.append(self.rust_tokenizer_class)

    for tokenizer_class in tokenizer_classes:
        tokenizer = tokenizer_class.from_pretrained(
            model_name,
            revision=revision,  # to pin the tokenizer version
        )

        encoding = tokenizer(sequences, padding=padding)
        decoded_sequences = [tokenizer.decode(seq, **merged_decode_kwargs) for seq in encoding["input_ids"]]

        # Check the raw encoding first, then the round trip back to text.
        encoding_data = encoding.data
        self.assertDictEqual(encoding_data, expected_encoding)

        for expected, decoded in zip(expected_texts, decoded_sequences):
            self.assertEqual(expected, decoded)
|
||||
|
||||
def assert_padded_input_match(self, input_r: list, input_p: list, max_length: int, pad_token_id: int):
|
||||
# Ensure we match max_length
|
||||
@@ -224,6 +285,30 @@ class TokenizerTesterMixin:
|
||||
for i in range(len(batch_encode_plus_sequences["input_ids"]))
|
||||
]
|
||||
|
||||
# TODO: this test could be extended to all tokenizers - not just the sentencepiece
|
||||
def test_sentencepiece_tokenize_and_convert_tokens_to_string(self):
|
||||
"""Test ``_tokenize`` and ``convert_tokens_to_string``."""
|
||||
if not self.test_sentencepiece:
|
||||
return
|
||||
|
||||
tokenizer = self.get_tokenizer()
|
||||
text = "This is text to test the tokenizer."
|
||||
|
||||
if self.test_sentencepiece_ignore_case:
|
||||
text = text.lower()
|
||||
|
||||
tokens = tokenizer.tokenize(text)
|
||||
|
||||
self.assertTrue(len(tokens) > 0)
|
||||
|
||||
# check if converting back to original text works
|
||||
reverse_text = tokenizer.convert_tokens_to_string(tokens)
|
||||
|
||||
if self.test_sentencepiece_ignore_case:
|
||||
reverse_text = reverse_text.lower()
|
||||
|
||||
self.assertEqual(reverse_text, text)
|
||||
|
||||
def test_subword_regularization_tokenizer(self) -> None:
|
||||
if not self.test_sentencepiece:
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user