update pyknp to rhoknp (#20890)

* update pyknp to rhoknp

* fix linter

* fix linter

* fix linter

* fix linter

* fix linter

* support rhoknp==1.1.0, fix testcase
This commit is contained in:
Hao Wang
2022-12-31 15:22:26 +09:00
committed by GitHub
parent 092d4d49dd
commit 375801d5e6
5 changed files with 21 additions and 10 deletions

View File

@@ -176,7 +176,7 @@ _deps = [
"beautifulsoup4", "beautifulsoup4",
"sudachipy>=0.6.6", "sudachipy>=0.6.6",
"sudachidict_core>=20220729", "sudachidict_core>=20220729",
"pyknp>=0.6.1", "rhoknp>=1.1.0",
] ]
@@ -245,7 +245,7 @@ class DepsTableUpdateCommand(Command):
extras = {} extras = {}
extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "pyknp") extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "rhoknp")
extras["sklearn"] = deps_list("scikit-learn") extras["sklearn"] = deps_list("scikit-learn")
extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp") extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")

View File

@@ -82,5 +82,5 @@ deps = {
"beautifulsoup4": "beautifulsoup4", "beautifulsoup4": "beautifulsoup4",
"sudachipy": "sudachipy>=0.6.6", "sudachipy": "sudachipy>=0.6.6",
"sudachidict_core": "sudachidict_core>=20220729", "sudachidict_core": "sudachidict_core>=20220729",
"pyknp": "pyknp>=0.6.1", "rhoknp": "rhoknp>=1.1.0",
} }

View File

@@ -647,25 +647,27 @@ class JumanppTokenizer:
self.trim_whitespace = trim_whitespace self.trim_whitespace = trim_whitespace
try: try:
import pyknp import rhoknp
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"You need to install pyknp to use JumanppTokenizer. " "You need to install rhoknp to use JumanppTokenizer. "
"See https://github.com/ku-nlp/pyknp for installation." "See https://github.com/ku-nlp/rhoknp for installation."
) )
self.juman = pyknp.Juman(jumanpp=True) self.juman = rhoknp.Jumanpp()
def tokenize(self, text, never_split=None, **kwargs): def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text.""" """Tokenizes a piece of text."""
if self.normalize_text: if self.normalize_text:
text = unicodedata.normalize("NFKC", text) text = unicodedata.normalize("NFKC", text)
text = text.strip()
never_split = self.never_split + (never_split if never_split is not None else []) never_split = self.never_split + (never_split if never_split is not None else [])
tokens = [] tokens = []
for mrph in self.juman.analysis(text).mrph_list(): for mrph in self.juman.apply_to_sentence(text).morphemes:
token = mrph.midasi token = mrph.text
if self.do_lower_case and token not in never_split: if self.do_lower_case and token not in never_split:
token = token.lower() token = token.lower()

View File

@@ -726,7 +726,7 @@ def is_sudachi_available():
def is_jumanpp_available(): def is_jumanpp_available():
return (importlib.util.find_spec("pyknp") is not None) and (shutil.which("jumanpp") is not None) return (importlib.util.find_spec("rhoknp") is not None) and (shutil.which("jumanpp") is not None)
# docstyle-ignore # docstyle-ignore

View File

@@ -318,6 +318,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "れた", ""], ["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "れた", ""],
) )
@require_jumanpp
def test_jumanpp_tokenizer_ext(self):
    """Check JumanppTokenizer on a sentence that embeds an emoticon containing a space.

    The emoticon ``m(_ _)m`` is expected to come back as a single token.
    """
    tokenizer = JumanppTokenizer()
    text = "ありがとうございますm(_ _)m見つけるのが大変です。"
    # The expected tokens concatenate back to `text`, so every slot holds a
    # non-empty surface form (including the one-character tokens).
    expected_tokens = ["ありがとう", "ございます", "m(_ _)m", "見つける", "の", "が", "大変です", "。"]

    self.assertListEqual(tokenizer.tokenize(text), expected_tokens)
def test_wordpiece_tokenizer(self): def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"] vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"]