Support additional dictionaries for BERT Japanese tokenizers (#6515)

* Update BERT Japanese tokenizers

* Update CircleCI config to download unidic

* Specify to use the latest dictionary packages
This commit is contained in:
Masatoshi Suzuki
2020-08-17 13:00:23 +09:00
committed by GitHub
parent 423eb5b1d7
commit 48c6c6139f
4 changed files with 97 additions and 15 deletions

View File

@@ -87,16 +87,38 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens, ["こんにちは", "", "世界", "", "こん", "##ばんは", "", "世界", ""])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
def test_mecab_tokenizer(self):
tokenizer = MecabTokenizer()
def test_mecab_tokenizer_ipadic(self):
tokenizer = MecabTokenizer(mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップルストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_unidic_lite(self):
try:
tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
except ModuleNotFoundError:
return
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_unidic(self):
try:
tokenizer = MecabTokenizer(mecab_dic="unidic")
except ModuleNotFoundError:
return
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_lower(self):
tokenizer = MecabTokenizer(do_lower_case=True)
tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
@@ -118,7 +140,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
)
def test_mecab_tokenizer_no_normalize(self):
tokenizer = MecabTokenizer(normalize_text=False)
tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic")
self.assertListEqual(
tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),