Add sentencepiece to BertJapaneseTokenizer (#19769)

* support sentencepiece for bertjapanesetokenizer

* add test vocab file for sentencepiece, bertjapanesetokenizer

* make BasicTokenizer be identical to transformers.models.bert.tokenization_bert.BasicTokenizer

* fix missing of \n in comment

* fix init argument missing in tests

* make spm_file be optional, exclude spiece.model from tests/fixtures, and add description comments

* make comment length less than 119

* apply doc style check
This commit is contained in:
Hao Wang
2022-10-21 23:04:49 +09:00
committed by GitHub
parent 2ebf4e6a7b
commit 31565ff0fd
2 changed files with 146 additions and 30 deletions

View File

@@ -334,6 +334,16 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"])
def test_sentencepiece_tokenizer(self):
tokenizer = BertJapaneseTokenizer.from_pretrained("nlp-waseda/roberta-base-japanese-with-auto-jumanpp")
subword_tokenizer = tokenizer.subword_tokenizer
tokens = subword_tokenizer.tokenize("国境 の 長い トンネル を 抜ける と 雪国 であった 。")
self.assertListEqual(tokens, ["▁国境", "▁の", "▁長い", "▁トンネル", "▁を", "▁抜ける", "▁と", "▁雪", "", "▁であった", "▁。"])
tokens = subword_tokenizer.tokenize("こんばんは こんばん にち は こんにちは")
self.assertListEqual(tokens, ["▁こん", "ばん", "", "▁こん", "ばん", "▁に", "", "▁は", "▁こんにちは"])
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("cl-tohoku/bert-base-japanese")