Add sentencepiece to BertJapaneseTokenizer (#19769)
* support sentencepiece for bertjapanesetokenizer
* add test vocab file for sentencepiece, bertjapanesetokenizer
* make BasicTokenizer be identical to transformers.models.bert.tokenization_bert.BasicTokenizer
* fix missing \n in comment
* fix missing init argument in tests
* make spm_file optional, exclude spiece.model from tests/fixtures, and add description comments
* make comment length less than 119
* apply doc style check
This commit is contained in:
@@ -334,6 +334,16 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"])
|
||||
|
||||
def test_sentencepiece_tokenizer(self):
    """Check the subword tokenizer of a pretrained checkpoint against fixed expectations.

    Loads ``nlp-waseda/roberta-base-japanese-with-auto-jumanpp`` through
    ``BertJapaneseTokenizer.from_pretrained`` and calls ``tokenize`` on its
    ``subword_tokenizer`` directly, with inputs that are already
    whitespace-separated.
    """
    # Each entry pairs an input sentence with the exact piece list asserted for it.
    cases = [
        (
            "国境 の 長い トンネル を 抜ける と 雪国 であった 。",
            ["▁国境", "▁の", "▁長い", "▁トンネル", "▁を", "▁抜ける", "▁と", "▁雪", "国", "▁であった", "▁。"],
        ),
        (
            "こんばんは こんばん にち は こんにちは",
            ["▁こん", "ばん", "は", "▁こん", "ばん", "▁に", "ち", "▁は", "▁こんにちは"],
        ),
    ]

    pretrained = BertJapaneseTokenizer.from_pretrained("nlp-waseda/roberta-base-japanese-with-auto-jumanpp")
    piece_tokenizer = pretrained.subword_tokenizer

    # Plain loop (no subTest) so the first mismatch stops the test, as in the
    # straight-line version.
    for sentence, expected_pieces in cases:
        self.assertListEqual(piece_tokenizer.tokenize(sentence), expected_pieces)
|
||||
|
||||
def test_sequence_builders(self):
|
||||
tokenizer = self.tokenizer_class.from_pretrained("cl-tohoku/bert-base-japanese")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user