Add regression tests for slow sentencepiece tokenizers. (#11737)

* add test_vocab_size for sentencepiece tok.

* add test_get_vocab for sentencepiece tok.

* add test_convert_token_and_id for sentencepiece tok.

* add test_tokenize_and_convert_tokens_to_string for all tok.

* improve test_tokenize_and_convert_tokens_to_string for sp. tok.

* add common tokenizer integration tests
- for albert
- for barthez

* add tokenizer integration tests to bert gen.

* add most tokenizer integration tests

* fix camembert tokenizer integration test

* add tokenizer integration test to marian

* add tokenizer integration test to reformer

* add typing and doc to tokenizer_integration_test_util

* fix tokenizer integration test of reformer

* improve test_sentencepiece_tokenize_and_convert_tokens_to_string

* empty commit to trigger CI

* fix tokenizer integration test of reformer

* remove code not needed anymore

* empty commit to trigger CI

* empty commit to trigger CI
This commit is contained in:
Philip May
2021-06-01 15:24:39 +02:00
committed by GitHub
parent c3d958b2c0
commit fcad801825
17 changed files with 624 additions and 111 deletions

View File

@@ -24,7 +24,7 @@ import tempfile
import unittest
from collections import OrderedDict
from itertools import takewhile
from typing import TYPE_CHECKING, Dict, List, Tuple, Union
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
from huggingface_hub import HfApi
from requests.exceptions import HTTPError
@@ -175,13 +175,74 @@ class TokenizerTesterMixin:
def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
# def get_input_output_texts(self) -> Tuple[str, str]:
# """Feel free to overwrite"""
# # TODO: @property
# return (
# "This is a test",
# "This is a test",
# )
def tokenizer_integration_test_util(
self,
expected_encoding: Dict,
model_name: str,
revision: str = None,
sequences: List[str] = None,
decode_kwargs: Dict[str, Any] = None,
padding: bool = True,
):
"""
Util for integration test.
Text is tokenized and then reverted back to text. Both results are then checked.
Args:
expected_encoding:
The expected result of the tokenizer output.
model_name:
The model name of the tokenizer to load and use.
revision:
The full git revision number of the model. This is to pin the
tokenizer config and to avoid that tests start to fail if the
config gets changed upstream.
sequences:
Can overwrite the texts that are used to check the tokenizer.
This is useful if the tokenizer supports non english languages
like france.
decode_kwargs:
Additional args for the ``decode`` function which reverts the
tokenized text back to a string.
padding:
Activates and controls padding of the tokenizer.
"""
decode_kwargs = {} if decode_kwargs is None else decode_kwargs
if sequences is None:
sequences = [
"Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
"general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
"Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained "
"models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
"BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
"conditioning on both left and right context in all layers.",
"The quick brown fox jumps over the lazy dog.",
]
tokenizer_classes = [self.tokenizer_class]
if self.test_rust_tokenizer:
tokenizer_classes.append(self.rust_tokenizer_class)
for tokenizer_class in tokenizer_classes:
tokenizer = tokenizer_class.from_pretrained(
model_name,
revision=revision, # to pin the tokenizer version
)
encoding = tokenizer(sequences, padding=padding)
decoded_sequences = [
tokenizer.decode(seq, skip_special_tokens=True, **decode_kwargs) for seq in encoding["input_ids"]
]
encoding_data = encoding.data
self.assertDictEqual(encoding_data, expected_encoding)
for expected, decoded in zip(sequences, decoded_sequences):
if self.test_sentencepiece_ignore_case:
expected = expected.lower()
self.assertEqual(expected, decoded)
def assert_padded_input_match(self, input_r: list, input_p: list, max_length: int, pad_token_id: int):
# Ensure we match max_length
@@ -224,6 +285,30 @@ class TokenizerTesterMixin:
for i in range(len(batch_encode_plus_sequences["input_ids"]))
]
# TODO: this test could be extended to all tokenizers - not just the sentencepiece
def test_sentencepiece_tokenize_and_convert_tokens_to_string(self):
"""Test ``_tokenize`` and ``convert_tokens_to_string``."""
if not self.test_sentencepiece:
return
tokenizer = self.get_tokenizer()
text = "This is text to test the tokenizer."
if self.test_sentencepiece_ignore_case:
text = text.lower()
tokens = tokenizer.tokenize(text)
self.assertTrue(len(tokens) > 0)
# check if converting back to original text works
reverse_text = tokenizer.convert_tokens_to_string(tokens)
if self.test_sentencepiece_ignore_case:
reverse_text = reverse_text.lower()
self.assertEqual(reverse_text, text)
def test_subword_regularization_tokenizer(self) -> None:
if not self.test_sentencepiece:
return