BertJapaneseTokenizer accepts options for mecab (#3566)
* BertJapaneseTokenizer accepts options for mecab * black * fix mecab_option to Optional[str]
This commit is contained in:
@@ -91,6 +91,20 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
["アップルストア", "で", "iphone", "8", "が", "発売", "さ", "れ", "た", "。"],
|
||||
)
|
||||
|
||||
def test_mecab_tokenizer_with_option(self):
    """Tokenize with a ``MecabTokenizer`` built from a custom ``mecab_option``.

    The option string points MeCab at the jumandic dictionary under
    ``/usr/local/lib/mecab/dic``. That dictionary is an optional system
    dependency, so the test is skipped (not silently passed) when the
    tokenizer cannot be constructed with it.
    """
    try:
        tokenizer = MecabTokenizer(
            do_lower_case=True, normalize_text=False, mecab_option="-d /usr/local/lib/mecab/dic/jumandic"
        )
    except RuntimeError:
        # Constructing the tokenizer raises RuntimeError when the dictionary
        # does not exist on this system. Skip explicitly so the test report
        # shows that nothing was verified instead of counting a pass.
        self.skipTest("MeCab dictionary /usr/local/lib/mecab/dic/jumandic is not available")

    self.assertListEqual(
        tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),
        ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "\u3000", "。"],
    )
|
||||
|
||||
def test_mecab_tokenizer_no_normalize(self):
|
||||
tokenizer = MecabTokenizer(normalize_text=False)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user