BertJapaneseTokenizer accepts options for MeCab (#3566)

* BertJapaneseTokenizer accepts options for MeCab

* black

* fix mecab_option to Optional[str]
This commit is contained in:
Yohei Tamura
2020-04-04 00:12:19 +09:00
committed by GitHub
parent 216e167ce6
commit 8594dd80dd
2 changed files with 22 additions and 3 deletions

View File

@@ -91,6 +91,20 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
["アップルストア", "", "iphone", "8", "", "発売", "", "", "", ""],
)
def test_mecab_tokenizer_with_option(self):
    """Check MecabTokenizer output when a ``mecab_option`` string is supplied.

    The option string passes ``-d /usr/local/lib/mecab/dic/jumandic`` to the
    tokenizer. If constructing the tokenizer raises ``RuntimeError``, the
    test is reported as skipped instead of silently passing, so a run that
    never exercised the assertion is visible in the test report.
    """
    try:
        tokenizer = MecabTokenizer(
            do_lower_case=True, normalize_text=False, mecab_option="-d /usr/local/lib/mecab/dic/jumandic"
        )
    except RuntimeError:
        # Per the original note, this error is raised when the dictionary
        # doesn't exist in the system. Skip explicitly rather than return,
        # otherwise the test counts as a pass without checking anything.
        self.skipTest("MecabTokenizer could not be created with mecab_option (dictionary presumably missing)")

    # NOTE(review): do_lower_case=True is passed, yet the expected tokens keep
    # "iPhone" in mixed case — confirm against MecabTokenizer's behavior.
    # NOTE(review): several expected tokens are empty strings; verify the
    # literals below were not damaged in transit before relying on them.
    self.assertListEqual(
        tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),
        ["アップルストア", "", "iPhone", "", "", "発売", "", "れた", "\u3000", ""],
    )
def test_mecab_tokenizer_no_normalize(self):
tokenizer = MecabTokenizer(normalize_text=False)