update pyknp to rhoknp (#20890)
* update pyknp to rhoknp
* fix linter
* fix linter
* fix linter
* fix linter
* fix linter
* support rhoknp==1.1.0, fix testcase
This commit is contained in:
setup.py (4 lines changed)
@@ -176,7 +176,7 @@ _deps = [
|
|||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
"sudachipy>=0.6.6",
|
"sudachipy>=0.6.6",
|
||||||
"sudachidict_core>=20220729",
|
"sudachidict_core>=20220729",
|
||||||
"pyknp>=0.6.1",
|
"rhoknp>=1.1.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -245,7 +245,7 @@ class DepsTableUpdateCommand(Command):
|
|||||||
|
|
||||||
extras = {}
|
extras = {}
|
||||||
|
|
||||||
extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "pyknp")
|
extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "sudachipy", "sudachidict_core", "rhoknp")
|
||||||
extras["sklearn"] = deps_list("scikit-learn")
|
extras["sklearn"] = deps_list("scikit-learn")
|
||||||
|
|
||||||
extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
|
extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
|
||||||
|
|||||||
@@ -82,5 +82,5 @@ deps = {
|
|||||||
"beautifulsoup4": "beautifulsoup4",
|
"beautifulsoup4": "beautifulsoup4",
|
||||||
"sudachipy": "sudachipy>=0.6.6",
|
"sudachipy": "sudachipy>=0.6.6",
|
||||||
"sudachidict_core": "sudachidict_core>=20220729",
|
"sudachidict_core": "sudachidict_core>=20220729",
|
||||||
"pyknp": "pyknp>=0.6.1",
|
"rhoknp": "rhoknp>=1.1.0",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -647,25 +647,27 @@ class JumanppTokenizer:
|
|||||||
self.trim_whitespace = trim_whitespace
|
self.trim_whitespace = trim_whitespace
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import pyknp
|
import rhoknp
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"You need to install pyknp to use JumanppTokenizer. "
|
"You need to install rhoknp to use JumanppTokenizer. "
|
||||||
"See https://github.com/ku-nlp/pyknp for installation."
|
"See https://github.com/ku-nlp/rhoknp for installation."
|
||||||
)
|
)
|
||||||
|
|
||||||
self.juman = pyknp.Juman(jumanpp=True)
|
self.juman = rhoknp.Jumanpp()
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None, **kwargs):
|
def tokenize(self, text, never_split=None, **kwargs):
|
||||||
"""Tokenizes a piece of text."""
|
"""Tokenizes a piece of text."""
|
||||||
if self.normalize_text:
|
if self.normalize_text:
|
||||||
text = unicodedata.normalize("NFKC", text)
|
text = unicodedata.normalize("NFKC", text)
|
||||||
|
|
||||||
|
text = text.strip()
|
||||||
|
|
||||||
never_split = self.never_split + (never_split if never_split is not None else [])
|
never_split = self.never_split + (never_split if never_split is not None else [])
|
||||||
tokens = []
|
tokens = []
|
||||||
|
|
||||||
for mrph in self.juman.analysis(text).mrph_list():
|
for mrph in self.juman.apply_to_sentence(text).morphemes:
|
||||||
token = mrph.midasi
|
token = mrph.text
|
||||||
|
|
||||||
if self.do_lower_case and token not in never_split:
|
if self.do_lower_case and token not in never_split:
|
||||||
token = token.lower()
|
token = token.lower()
|
||||||
|
|||||||
@@ -726,7 +726,7 @@ def is_sudachi_available():
|
|||||||
|
|
||||||
|
|
||||||
def is_jumanpp_available():
|
def is_jumanpp_available():
|
||||||
return (importlib.util.find_spec("pyknp") is not None) and (shutil.which("jumanpp") is not None)
|
return (importlib.util.find_spec("rhoknp") is not None) and (shutil.which("jumanpp") is not None)
|
||||||
|
|
||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
|
|||||||
@@ -318,6 +318,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "。"],
|
["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "。"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@require_jumanpp
|
||||||
|
def test_jumanpp_tokenizer_ext(self):
|
||||||
|
tokenizer = JumanppTokenizer()
|
||||||
|
|
||||||
|
self.assertListEqual(
|
||||||
|
tokenizer.tokenize("ありがとうございますm(_ _)m見つけるのが大変です。"),
|
||||||
|
["ありがとう", "ございます", "m(_ _)m", "見つける", "の", "が", "大変です", "。"],
|
||||||
|
)
|
||||||
|
|
||||||
def test_wordpiece_tokenizer(self):
|
def test_wordpiece_tokenizer(self):
|
||||||
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"]
|
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは", "ばんは", "##こん", "##にちは", "##ばんは"]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user