diff --git a/.circleci/config.yml b/.circleci/config.yml index 4d76bb6ae2..100109539b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -56,7 +56,7 @@ jobs: RUN_CUSTOM_TOKENIZERS: yes steps: - checkout - - run: sudo pip install .[mecab,testing] + - run: sudo pip install .[ja,testing] - run: python -m pytest -s ./tests/test_tokenization_bert_japanese.py | tee output.txt - store_artifacts: path: ~/transformers/output.txt diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 27f048dc25..0ef6c976f1 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -74,14 +74,16 @@ For a list that includes community-uploaded models, refer to `https://huggingfac | | | (see `details on dbmdz repository `__). | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``cl-tohoku/bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | -| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. | -| | | | `MeCab `__ is required for tokenization. | +| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece and this requires some extra dependencies, | +| | | | `fugashi `__ which is a wrapper around `MeCab `__. | +| | | | Use ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install from source) to install them. | | | | | | | | (see `details on cl-tohoku repository `__). | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``cl-tohoku/bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | -| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. 
| -| | | | `MeCab `__ is required for tokenization. | +| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece, which requires extra dependencies, | +| | | | `fugashi `__ which is a wrapper around `MeCab `__. | +| | | | Use ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install from source) to install them. | | | | | | | | (see `details on cl-tohoku repository `__). | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/setup.cfg b/setup.cfg index d630f0b3a9..e5467ab623 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,10 +10,10 @@ known_third_party = faiss fastprogress fire + fugashi git h5py matplotlib - MeCab nlp nltk numpy diff --git a/setup.py b/setup.py index c4fc91ab31..206c3e3540 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ if stale_egg_info.exists(): extras = {} -extras["mecab"] = ["mecab-python3<1"] +extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0,<2.0"] extras["sklearn"] = ["scikit-learn"] # keras2onnx and onnxconverter-common version is specific through a commit until 1.7.0 lands on pypi @@ -97,7 +97,7 @@ extras["quality"] = [ "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort", "flake8", ] -extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3<1", "scikit-learn", "tensorflow", "torch"] +extras["dev"] = extras["testing"] + extras["quality"] + extras["ja"] + ["scikit-learn", "tensorflow", "torch"] setup( name="transformers", diff --git a/src/transformers/tokenization_bert_japanese.py b/src/transformers/tokenization_bert_japanese.py index 43ae8e1894..c3ede2c47e 100644 --- a/src/transformers/tokenization_bert_japanese.py +++ b/src/transformers/tokenization_bert_japanese.py @@ -185,9 +185,14 @@ class MecabTokenizer: self.never_split = never_split if never_split is not 
None else [] self.normalize_text = normalize_text - import MeCab + import fugashi + import ipadic - self.mecab = MeCab.Tagger(mecab_option) if mecab_option is not None else MeCab.Tagger() + # Use ipadic by default (later options can override it) + mecab_option = mecab_option or "" + mecab_option = ipadic.MECAB_ARGS + " " + mecab_option + + self.mecab = fugashi.GenericTagger(mecab_option) def tokenize(self, text, never_split=None, **kwargs): """Tokenizes a piece of text.""" @@ -197,21 +202,13 @@ class MecabTokenizer: never_split = self.never_split + (never_split if never_split is not None else []) tokens = [] - mecab_output = self.mecab.parse(text) + for word in self.mecab(text): + token = word.surface - cursor = 0 - for line in mecab_output.split("\n"): - if line == "EOS": - break - - token, _ = line.split("\t") - token_start = text.index(token, cursor) - token_end = token_start + len(token) if self.do_lower_case and token not in never_split: token = token.lower() tokens.append(token) - cursor = token_end return tokens