Add sudachi_projection option to BertJapaneseTokenizer (#28503)

* add sudachi_projection option

* Upgrade sudachipy>=0.6.8

* add a test case for sudachi_projection

* Compatible with older versions of SudachiPy

* make fixup

* make style

* error message for unidic download

* revert jumanpp test cases

* format options for sudachi_projection

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* format options for sudachi_split_mode and sudachi_dict_type

* comment

* add tests for full_tokenizer kwargs

* pass projection arg directly

* require_sudachi_projection

* make style

* revert upgrade sudachipy

* check is_sudachi_projection_available()

* revert dependency_version_table and bugfix

* style format

* simply raise ImportError

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* simply raise ImportError

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
Hiroshi Matsuda
2024-02-13 03:47:20 +00:00
committed by GitHub
parent b44567538b
commit da20209dbc
5 changed files with 109 additions and 15 deletions

View File

@@ -29,7 +29,7 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import (
SudachiTokenizer,
WordpieceTokenizer,
)
from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi
from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi_projection
from ...test_tokenization_common import TokenizerTesterMixin
@@ -60,6 +60,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"##、",
"",
"##。",
"アップルストア",
"外国",
"##人",
"参政",
"##権",
"此れ",
"",
"",
"です",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
@@ -113,6 +122,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens, tokens_loaded)
def test_mecab_full_tokenizer_with_mecab_kwargs(self):
tokenizer = self.tokenizer_class(
self.vocab_file, word_tokenizer_type="mecab", mecab_kwargs={"mecab_dic": "ipadic"}
)
text = "アップルストア"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["アップルストア"])
def test_mecab_tokenizer_ipadic(self):
tokenizer = MecabTokenizer(mecab_dic="ipadic")
@@ -134,6 +152,12 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_mecab_tokenizer_unidic(self):
try:
import unidic
self.assertTrue(
os.path.isdir(unidic.DICDIR),
"The content of unidic was not downloaded. Run `python -m unidic download` before running this test case. Note that this requires 2.1GB on disk.",
)
tokenizer = MecabTokenizer(mecab_dic="unidic")
except ModuleNotFoundError:
return
@@ -173,7 +197,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
["アップルストア", "", "iPhone", "", "", "発売", "", "", "", " ", ""],
)
@require_sudachi
@require_sudachi_projection
def test_pickle_sudachi_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi")
self.assertIsNotNone(tokenizer)
@@ -194,7 +218,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens, tokens_loaded)
@require_sudachi
@require_sudachi_projection
def test_sudachi_tokenizer_core(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core")
@@ -205,37 +229,61 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
)
# fmt: on
@require_sudachi
@require_sudachi_projection
def test_sudachi_tokenizer_split_mode_A(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="A")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "", "参政", ""])
@require_sudachi
@require_sudachi_projection
def test_sudachi_tokenizer_split_mode_B(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="B")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人", "参政権"])
@require_sudachi
@require_sudachi_projection
def test_sudachi_tokenizer_split_mode_C(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="C")
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人参政権"])
@require_sudachi
@require_sudachi_projection
def test_sudachi_full_tokenizer_with_sudachi_kwargs_split_mode_B(self):
tokenizer = self.tokenizer_class(
self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_split_mode": "B"}
)
self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "##人", "参政", "##権"])
@require_sudachi_projection
def test_sudachi_tokenizer_projection(self):
tokenizer = SudachiTokenizer(
sudachi_dict_type="core", sudachi_split_mode="A", sudachi_projection="normalized_nouns"
)
self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "", "", "です", ""])
@require_sudachi_projection
def test_sudachi_full_tokenizer_with_sudachi_kwargs_sudachi_projection(self):
tokenizer = self.tokenizer_class(
self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_projection": "normalized_nouns"}
)
self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "", "", "です", ""])
@require_sudachi_projection
def test_sudachi_tokenizer_lower(self):
tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core")
self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),[" ", "\t", "アップル", "ストア", "", "iphone", "8", " ", "", " ", " ", "\n ", "発売", "", "", "", " ", "", " ", " "]) # fmt: skip
@require_sudachi
@require_sudachi_projection
def test_sudachi_tokenizer_no_normalize(self):
tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core")
self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone\n 発売された 。 "),[" ", "\t", "アップル", "ストア", "", "iPhone", "", " ", "", " ", " ", "\n ", "発売", "", "", "", "\u3000", "", " ", " "]) # fmt: skip
@require_sudachi
@require_sudachi_projection
def test_sudachi_tokenizer_trim_whitespace(self):
tokenizer = SudachiTokenizer(trim_whitespace=True, sudachi_dict_type="core")
@@ -293,6 +341,17 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
["アップル", "ストア", "", "iPhone", "8", "", "発売", "", "れた", ""],
)
@require_jumanpp
def test_jumanpp_full_tokenizer_with_jumanpp_kwargs_trim_whitespace(self):
tokenizer = self.tokenizer_class(
self.vocab_file, word_tokenizer_type="jumanpp", jumanpp_kwargs={"trim_whitespace": True}
)
text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "", "世界", "", "こん", "##ばんは", "", "世界", ""])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
@require_jumanpp
def test_jumanpp_tokenizer_ext(self):
tokenizer = JumanppTokenizer()