Transformer-XL: Improved tokenization with sacremoses (#6322)

* Improved tokenization with sacremoses

 * The TransfoXLTokenizer is now using sacremoses for tokenization
 * Added tokenization of comma-separated and floating point numbers.
 * Removed prepare_for_tokenization() from tokenization_transfo_xl.py because punctuation is handled by sacremoses
 * Added corresponding tests
 * Removed test comapring TransfoXLTokenizer and TransfoXLTokenizerFast
 * Added deprecation warning to TransfoXLTokenizerFast

* isort change

Co-authored-by: Teven <teven.lescao@gmail.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
RafaelWO
2020-08-28 15:56:17 +02:00
committed by GitHub
parent 930153e7d2
commit cb276b41de
3 changed files with 127 additions and 34 deletions

View File

@@ -12,14 +12,12 @@ from transformers import (
OpenAIGPTTokenizer,
PreTrainedTokenizer,
RobertaTokenizer,
TransfoXLTokenizer,
is_torch_available,
)
from transformers.testing_utils import get_tests_dir, require_torch
from transformers.tokenization_distilbert import DistilBertTokenizerFast
from transformers.tokenization_openai import OpenAIGPTTokenizerFast
from transformers.tokenization_roberta import RobertaTokenizerFast
from transformers.tokenization_transfo_xl import TransfoXLTokenizerFast
logger = logging.getLogger(__name__)
@@ -895,17 +893,3 @@ class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
max_length=max_length,
padding="max_length",
)
class TransfoXLFastTokenizerTest(NoPaddingTokenFastTokenizerMatchingTest):
TOKENIZERS_CLASSES = frozenset(
[Tokenizer("TransfoXL", TransfoXLTokenizerFast, TransfoXLTokenizer, "pretrained_vocab_file", None, None)]
)
@require_torch
def test_all_tokenizers(self):
super().test_all_tokenizers()
@require_torch
def test_pretokenized_tokenizers(self):
super().test_pretokenized_tokenizers()

View File

@@ -83,6 +83,44 @@ class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
)
def test_full_tokenizer_moses_numbers(self):
tokenizer = TransfoXLTokenizer(lower_case=False)
text_in = "Hello (bracket) and side-scrolled [and] Henry's $5,000 with 3.34 m. What's up!?"
tokens_out = [
"Hello",
"(",
"bracket",
")",
"and",
"side",
"@-@",
"scrolled",
"[",
"and",
"]",
"Henry",
"'s",
"$",
"5",
"@,@",
"000",
"with",
"3",
"@.@",
"34",
"m",
".",
"What",
"'s",
"up",
"!",
"?",
]
self.assertListEqual(tokenizer.tokenize(text_in), tokens_out)
self.assertEqual(tokenizer.convert_tokens_to_string(tokens_out), text_in)
def test_move_added_token(self):
tokenizer = self.get_tokenizer()
original_len = len(tokenizer)