Add sudachi and jumanpp tokenizers for bert_japanese (#19043)

* add sudachipy and jumanpp tokenizers for bert_japanese

* use ImportError instead of ModuleNotFoundError in SudachiTokenizer and JumanppTokenizer

* put test cases of test_tokenization_bert_japanese in one line

* add require_sudachi and require_jumanpp decorator for testing

* add sudachi and pyknp(jumanpp) to dependencies

* remove sudachi_dict_small and sudachi_dict_full from dependencies

* empty commit for ci
This commit is contained in:
r-terada
2022-10-06 00:41:37 +09:00
committed by GitHub
parent 60db81ff60
commit 2f53ab5745
8 changed files with 373 additions and 7 deletions

View File

@@ -52,6 +52,7 @@ from .utils import (
is_flax_available,
is_ftfy_available,
is_ipex_available,
is_jumanpp_available,
is_librosa_available,
is_onnx_available,
is_pandas_available,
@@ -66,6 +67,7 @@ from .utils import (
is_sentencepiece_available,
is_soundfile_availble,
is_spacy_available,
is_sudachi_available,
is_tensorflow_probability_available,
is_tensorflow_text_available,
is_tf2onnx_available,
@@ -671,6 +673,20 @@ def require_usr_bin_time(test_case):
return unittest.skipUnless(cmd_exists("/usr/bin/time"), "test requires /usr/bin/time")(test_case)
def require_sudachi(test_case):
    """
    Decorator for tests that depend on sudachi.

    The decorated test is skipped with the reason "test requires sudachi" whenever `is_sudachi_available()` returns a
    falsy value; otherwise it is left to run as usual.
    """
    sudachi_present = is_sudachi_available()
    skip_without_sudachi = unittest.skipUnless(sudachi_present, "test requires sudachi")
    return skip_without_sudachi(test_case)
def require_jumanpp(test_case):
    """
    Decorator for tests that depend on jumanpp.

    The decorated test is skipped with the reason "test requires jumanpp" whenever `is_jumanpp_available()` returns a
    falsy value; otherwise it is left to run as usual.
    """
    jumanpp_present = is_jumanpp_available()
    skip_without_jumanpp = unittest.skipUnless(jumanpp_present, "test requires jumanpp")
    return skip_without_jumanpp(test_case)
def get_gpu_count():
"""
Return the number of available gpus (regardless of whether torch, tf or jax is used)