From 375801d5e65457d2ef86e6b269703c5f9ad67252 Mon Sep 17 00:00:00 2001 From: Hao Wang <50416856+conan1024hao@users.noreply.github.com> Date: Sat, 31 Dec 2022 15:22:26 +0900 Subject: [PATCH] update pyknp to rhoknp (#20890) * update pyknp to rhoknp * fix linter * fix linter * fix linter * fix linter * fix linter * support rhoknp==1.1.0, fix testcase --- setup.py | 4 ++-- src/transformers/dependency_versions_table.py | 2 +- .../bert_japanese/tokenization_bert_japanese.py | 14 ++++++++------ src/transformers/utils/import_utils.py | 2 +- .../test_tokenization_bert_japanese.py | 9 +++++++++ 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 91ee364202..aa7be0f6cb 100644 --- a/setup.py +++ b/setup.py @@ -176,7 +176,7 @@ _deps = [ "beautifulsoup4", "sudachipy>=0.6.6", "sudachidict_core>=20220729", - "pyknp>=0.6.1", + "rhoknp>=1.1.0", ] @@ -245,7 +245,7 @@ class DepsTableUpdateCommand(Command): extras = {} -extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "pyknp") +extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "rhoknp") extras["sklearn"] = deps_list("scikit-learn") extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp") diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 75261f414e..b81734043f 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -82,5 +82,5 @@ deps = { "beautifulsoup4": "beautifulsoup4", "sudachipy": "sudachipy>=0.6.6", "sudachidict_core": "sudachidict_core>=20220729", - "pyknp": "pyknp>=0.6.1", + "rhoknp": "rhoknp>=1.1.0", } diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index a2aa459bd2..27d66ae9a9 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -647,25 +647,27 @@ class JumanppTokenizer: self.trim_whitespace = trim_whitespace try: - import pyknp + import rhoknp except ImportError: raise ImportError( - "You need to install pyknp to use JumanppTokenizer. " - "See https://github.com/ku-nlp/pyknp for installation." + "You need to install rhoknp to use JumanppTokenizer. " + "See https://github.com/ku-nlp/rhoknp for installation." ) - self.juman = pyknp.Juman(jumanpp=True) + self.juman = rhoknp.Jumanpp() def tokenize(self, text, never_split=None, **kwargs): """Tokenizes a piece of text.""" if self.normalize_text: text = unicodedata.normalize("NFKC", text) + text = text.strip() + never_split = self.never_split + (never_split if never_split is not None else []) tokens = [] - for mrph in self.juman.analysis(text).mrph_list(): - token = mrph.midasi + for mrph in self.juman.apply_to_sentence(text).morphemes: + token = mrph.text if self.do_lower_case and token not in never_split: token = token.lower() diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index d092697452..6fb76385da 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -726,7 +726,7 @@ def is_sudachi_available(): def is_jumanpp_available(): - return (importlib.util.find_spec("pyknp") is not None) and (shutil.which("jumanpp") is not None) + return (importlib.util.find_spec("rhoknp") is not None) and (shutil.which("jumanpp") is not None) # docstyle-ignore diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py index 7e89c36b7a..038a334ceb 100644 --- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py +++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py @@ -318,6 +318,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "。"], ) + @require_jumanpp + def test_jumanpp_tokenizer_ext(self): + tokenizer = JumanppTokenizer() + + self.assertListEqual( + tokenizer.tokenize("ありがとうございますm(_ _)m見つけるのが大変です。"), + ["ありがとう", "ございます", "m(_ _)m", "見つける", "の", "が", "大変です", "。"], + ) + def test_wordpiece_tokenizer(self): vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"]