Add sudachi and jumanpp tokenizers for bert_japanese (#19043)

* add sudachipy and jumanpp tokenizers for bert_japanese

* use ImportError instead of ModuleNotFoundError in SudachiTokenizer and JumanppTokenizer

* put test cases of test_tokenization_bert_japanese in one line

* add require_sudachi and require_jumanpp decorators for testing

* add sudachipy and pyknp (jumanpp) to dependencies

* remove sudachi_dict_small and sudachi_dict_full from dependencies

* empty commit for ci
This commit is contained in:
r-terada
2022-10-06 00:41:37 +09:00
committed by GitHub
parent 60db81ff60
commit 2f53ab5745
8 changed files with 373 additions and 7 deletions

View File

@@ -170,6 +170,9 @@ _deps = [
"unidic_lite>=1.0.7",
"uvicorn",
"beautifulsoup4",
"sudachipy>=0.6.6",
"sudachidict_core>=20220729",
"pyknp>=0.6.1",
]
@@ -239,7 +242,7 @@ class DepsTableUpdateCommand(Command):
extras = {}
extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic")
extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "pyknp")
extras["sklearn"] = deps_list("scikit-learn")
extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text")