[fsmt tokenizer] support lowercase tokenizer (#8389)

* support lowercase tokenizer

* fix arg pos
This commit is contained in:
Stas Bekman
2020-11-09 07:41:39 -08:00
committed by GitHub
parent 1e2acd0dcf
commit 78d706f3ae
3 changed files with 23 additions and 1 deletions

View File

@@ -151,6 +151,13 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
decoded_text = tokenizer_dec.decode(encoded_ids, skip_special_tokens=True)
self.assertEqual(decoded_text, src_text)
@slow
def test_tokenizer_lower(self):
tokenizer = FSMTTokenizer.from_pretrained("facebook/wmt19-ru-en", do_lower_case=True)
tokens = tokenizer.tokenize("USA is United States of America")
expected = ["us", "a</w>", "is</w>", "un", "i", "ted</w>", "st", "ates</w>", "of</w>", "am", "er", "ica</w>"]
self.assertListEqual(tokens, expected)
@unittest.skip("FSMTConfig.__init__ requires non-optional args")
def test_torch_encode_plus_sent_to_model(self):
pass