From da20209dbc26a6a870a6e7be87faa657b571b7bc Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Date: Tue, 13 Feb 2024 03:47:20 +0000 Subject: [PATCH] Add sudachi_projection option to BertJapaneseTokenizer (#28503) * add sudachi_projection option * Upgrade sudachipy>=0.6.8 * add a test case for sudachi_projection * Compatible with older versions of SudachiPy * make fixup * make style * error message for unidic download * revert jumanpp test cases * format options for sudachi_projection Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * format options for sudachi_split_mode and sudachi_dict_type * comment * add tests for full_tokenizer kwargs * pass projection arg directly * require_sudachi_projection * make style * revert upgrade sudachipy * check is_sudachi_projection_available() * revert dependency_version_table and bugfix * style format * simply raise ImportError Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * simply raise ImportError --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- .../tokenization_bert_japanese.py | 21 +++-- src/transformers/testing_utils.py | 10 +++ src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 15 +++- .../test_tokenization_bert_japanese.py | 77 ++++++++++++++++--- 5 files changed, 109 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index e0f09c20b2..b2d1ac1958 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -22,7 +22,7 @@ import unicodedata from typing import Any, Dict, List, Optional, Tuple from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace -from ...utils import is_sentencepiece_available, logging +from ...utils import is_sentencepiece_available, is_sudachi_projection_available, logging if is_sentencepiece_available(): @@ -542,6 +542,7 @@ class SudachiTokenizer: sudachi_config_path=None, sudachi_resource_dir=None, sudachi_dict_type="core", + sudachi_projection=None, ): """ Constructs a SudachiTokenizer. @@ -557,11 +558,13 @@ class SudachiTokenizer: **trim_whitespace**: (*optional*) boolean (default False) Whether to trim all whitespace, tab, newline from tokens. **sudachi_split_mode**: (*optional*) string - Split mode of sudachi, choose from "A", "B", "C". + Split mode of sudachi, choose from `["A", "B", "C"]`. **sudachi_config_path**: (*optional*) string **sudachi_resource_dir**: (*optional*) string **sudachi_dict_type**: (*optional*) string - dict type of sudachi, choose from "small", "core", "full". + dict type of sudachi, choose from `["small", "core", "full"]`. + **sudachi_projection**: (*optional*) string + Word projection mode of sudachi, choose from `["surface", "normalized", "reading", "dictionary", "dictionary_and_surface", "normalized_and_surface", "normalized_nouns"]`. """ self.do_lower_case = do_lower_case @@ -586,9 +589,17 @@ class SudachiTokenizer: else: raise ValueError("Invalid sudachi_split_mode is specified.") - self.sudachi = dictionary.Dictionary( + self.projection = sudachi_projection + + sudachi_dictionary = dictionary.Dictionary( config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type - ).create(self.split_mode) + ) + if is_sudachi_projection_available(): + self.sudachi = sudachi_dictionary.create(self.split_mode, projection=self.projection) + elif self.projection is not None: + raise ImportError("You need to install sudachipy>=0.6.8 to specify `projection` field in sudachi_kwargs.") + else: + self.sudachi = sudachi_dictionary.create(self.split_mode) def tokenize(self, text, never_split=None, **kwargs): """Tokenizes a piece of text.""" diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 257948793a..eb74af7a4a 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -95,6 +95,7 @@ from .utils import ( is_soundfile_availble, is_spacy_available, is_sudachi_available, + is_sudachi_projection_available, is_tensorflow_probability_available, is_tensorflow_text_available, is_tf2onnx_available, @@ -1043,6 +1044,15 @@ def require_sudachi(test_case): return unittest.skipUnless(is_sudachi_available(), "test requires sudachi")(test_case) +def require_sudachi_projection(test_case): + """ + Decorator marking a test that requires sudachi_projection + """ + return unittest.skipUnless(is_sudachi_projection_available(), "test requires sudachi which supports projection")( + test_case + ) + + def require_jumanpp(test_case): """ Decorator marking a test that requires jumanpp diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index bb05dd28ef..a608304ac9 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -163,6 +163,7 @@ from .import_utils import ( is_spacy_available, is_speech_available, is_sudachi_available, + is_sudachi_projection_available, is_tensorflow_probability_available, is_tensorflow_text_available, is_tf2onnx_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index e0b4fea0e6..501d68b492 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -135,7 +135,7 @@ if _sklearn_available: _smdistributed_available = importlib.util.find_spec("smdistributed") is not None _soundfile_available = _is_package_available("soundfile") _spacy_available = _is_package_available("spacy") -_sudachipy_available = _is_package_available("sudachipy") +_sudachipy_available, _sudachipy_version = _is_package_available("sudachipy", return_version=True) _tensorflow_probability_available = _is_package_available("tensorflow_probability") _tensorflow_text_available = _is_package_available("tensorflow_text") _tf2onnx_available = _is_package_available("tf2onnx") @@ -896,6 +896,19 @@ def is_sudachi_available(): return _sudachipy_available +def get_sudachi_version(): + return _sudachipy_version + + +def is_sudachi_projection_available(): + if not is_sudachi_available(): + return False + + # NOTE: We require sudachipy>=0.6.8 to use projection option in sudachi_kwargs for the constructor of BertJapaneseTokenizer. + # - `projection` option is not supported in sudachipy<0.6.8, see https://github.com/WorksApplications/sudachi.rs/issues/230 + return version.parse(_sudachipy_version) >= version.parse("0.6.8") + + def is_jumanpp_available(): return (importlib.util.find_spec("rhoknp") is not None) and (shutil.which("jumanpp") is not None) diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py index bc78006979..cedf7492cf 100644 --- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py +++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py @@ -29,7 +29,7 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import ( SudachiTokenizer, WordpieceTokenizer, ) -from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi +from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi_projection from ...test_tokenization_common import TokenizerTesterMixin @@ -60,6 +60,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "##、", "。", "##。", + "アップルストア", + "外国", + "##人", + "参政", + "##権", + "此れ", + "は", + "猫", + "です", ] self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) @@ -113,6 +122,15 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(tokens, tokens_loaded) + def test_mecab_full_tokenizer_with_mecab_kwargs(self): + tokenizer = self.tokenizer_class( + self.vocab_file, word_tokenizer_type="mecab", mecab_kwargs={"mecab_dic": "ipadic"} + ) + + text = "アップルストア" + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, ["アップルストア"]) + def test_mecab_tokenizer_ipadic(self): tokenizer = MecabTokenizer(mecab_dic="ipadic") @@ -134,6 +152,12 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_mecab_tokenizer_unidic(self): try: + import unidic + + self.assertTrue( + os.path.isdir(unidic.DICDIR), + "The content of unidic was not downloaded. Run `python -m unidic download` before running this test case. Note that this requires 2.1GB on disk.", + ) tokenizer = MecabTokenizer(mecab_dic="unidic") except ModuleNotFoundError: return @@ -173,7 +197,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"], ) - @require_sudachi + @require_sudachi_projection def test_pickle_sudachi_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi") self.assertIsNotNone(tokenizer) @@ -194,7 +218,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(tokens, tokens_loaded) - @require_sudachi + @require_sudachi_projection def test_sudachi_tokenizer_core(self): tokenizer = SudachiTokenizer(sudachi_dict_type="core") @@ -205,37 +229,61 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ) # fmt: on - @require_sudachi + @require_sudachi_projection def test_sudachi_tokenizer_split_mode_A(self): tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="A") self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "人", "参政", "権"]) - @require_sudachi + @require_sudachi_projection def test_sudachi_tokenizer_split_mode_B(self): tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="B") self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人", "参政権"]) - @require_sudachi + @require_sudachi_projection def test_sudachi_tokenizer_split_mode_C(self): tokenizer = SudachiTokenizer(sudachi_dict_type="core", sudachi_split_mode="C") self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国人参政権"]) - @require_sudachi + @require_sudachi_projection + def test_sudachi_full_tokenizer_with_sudachi_kwargs_split_mode_B(self): + tokenizer = self.tokenizer_class( + self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_split_mode": "B"} + ) + + self.assertListEqual(tokenizer.tokenize("外国人参政権"), ["外国", "##人", "参政", "##権"]) + + @require_sudachi_projection + def test_sudachi_tokenizer_projection(self): + tokenizer = SudachiTokenizer( + sudachi_dict_type="core", sudachi_split_mode="A", sudachi_projection="normalized_nouns" + ) + + self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "は", "猫", "です", "。"]) + + @require_sudachi_projection + def test_sudachi_full_tokenizer_with_sudachi_kwargs_sudachi_projection(self): + tokenizer = self.tokenizer_class( + self.vocab_file, word_tokenizer_type="sudachi", sudachi_kwargs={"sudachi_projection": "normalized_nouns"} + ) + + self.assertListEqual(tokenizer.tokenize("これはねこです。"), ["此れ", "は", "猫", "です", "。"]) + + @require_sudachi_projection def test_sudachi_tokenizer_lower(self): tokenizer = SudachiTokenizer(do_lower_case=True, sudachi_dict_type="core") self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),[" ", "\t", "アップル", "ストア", "で", "iphone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", " ", "。", " ", " "]) # fmt: skip - @require_sudachi + @require_sudachi_projection def test_sudachi_tokenizer_no_normalize(self): tokenizer = SudachiTokenizer(normalize_text=False, sudachi_dict_type="core") self.assertListEqual(tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "),[" ", "\t", "アップル", "ストア", "で", "iPhone", "8", " ", "が", " ", " ", "\n ", "発売", "さ", "れ", "た", "\u3000", "。", " ", " "]) # fmt: skip - @require_sudachi + @require_sudachi_projection def test_sudachi_tokenizer_trim_whitespace(self): tokenizer = SudachiTokenizer(trim_whitespace=True, sudachi_dict_type="core") @@ -293,6 +341,17 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "。"], ) + @require_jumanpp + def test_jumanpp_full_tokenizer_with_jumanpp_kwargs_trim_whitespace(self): + tokenizer = self.tokenizer_class( + self.vocab_file, word_tokenizer_type="jumanpp", jumanpp_kwargs={"trim_whitespace": True} + ) + + text = "こんにちは、世界。\nこんばんは、世界。" + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14]) + @require_jumanpp def test_jumanpp_tokenizer_ext(self): tokenizer = JumanppTokenizer()