Transformer-XL: Improved tokenization with sacremoses (#6322)
* Improved tokenization with sacremoses
* The TransfoXLTokenizer is now using sacremoses for tokenization
* Added tokenization of comma-separated and floating point numbers.
* Removed prepare_for_tokenization() from tokenization_transfo_xl.py because punctuation is handled by sacremoses
* Added corresponding tests
* Removed test comparing TransfoXLTokenizer and TransfoXLTokenizerFast
* Added deprecation warning to TransfoXLTokenizerFast
* isort change

Co-authored-by: Teven <teven.lescao@gmail.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
@@ -12,14 +12,12 @@ from transformers import (
|
||||
OpenAIGPTTokenizer,
|
||||
PreTrainedTokenizer,
|
||||
RobertaTokenizer,
|
||||
TransfoXLTokenizer,
|
||||
is_torch_available,
|
||||
)
|
||||
from transformers.testing_utils import get_tests_dir, require_torch
|
||||
from transformers.tokenization_distilbert import DistilBertTokenizerFast
|
||||
from transformers.tokenization_openai import OpenAIGPTTokenizerFast
|
||||
from transformers.tokenization_roberta import RobertaTokenizerFast
|
||||
from transformers.tokenization_transfo_xl import TransfoXLTokenizerFast
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -895,17 +893,3 @@ class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
|
||||
max_length=max_length,
|
||||
padding="max_length",
|
||||
)
|
||||
|
||||
|
||||
class TransfoXLFastTokenizerTest(NoPaddingTokenFastTokenizerMatchingTest):
|
||||
TOKENIZERS_CLASSES = frozenset(
|
||||
[Tokenizer("TransfoXL", TransfoXLTokenizerFast, TransfoXLTokenizer, "pretrained_vocab_file", None, None)]
|
||||
)
|
||||
|
||||
@require_torch
|
||||
def test_all_tokenizers(self):
|
||||
super().test_all_tokenizers()
|
||||
|
||||
@require_torch
|
||||
def test_pretokenized_tokenizers(self):
|
||||
super().test_pretokenized_tokenizers()
|
||||
|
||||
@@ -83,6 +83,44 @@ class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
|
||||
)
|
||||
|
||||
def test_full_tokenizer_moses_numbers(self):
|
||||
tokenizer = TransfoXLTokenizer(lower_case=False)
|
||||
text_in = "Hello (bracket) and side-scrolled [and] Henry's $5,000 with 3.34 m. What's up!?"
|
||||
tokens_out = [
|
||||
"Hello",
|
||||
"(",
|
||||
"bracket",
|
||||
")",
|
||||
"and",
|
||||
"side",
|
||||
"@-@",
|
||||
"scrolled",
|
||||
"[",
|
||||
"and",
|
||||
"]",
|
||||
"Henry",
|
||||
"'s",
|
||||
"$",
|
||||
"5",
|
||||
"@,@",
|
||||
"000",
|
||||
"with",
|
||||
"3",
|
||||
"@.@",
|
||||
"34",
|
||||
"m",
|
||||
".",
|
||||
"What",
|
||||
"'s",
|
||||
"up",
|
||||
"!",
|
||||
"?",
|
||||
]
|
||||
|
||||
self.assertListEqual(tokenizer.tokenize(text_in), tokens_out)
|
||||
|
||||
self.assertEqual(tokenizer.convert_tokens_to_string(tokens_out), text_in)
|
||||
|
||||
def test_move_added_token(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
original_len = len(tokenizer)
|
||||
|
||||
Reference in New Issue
Block a user