From cf3cf304ca234ce70ffa223d7868e63133acd7ec Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 31 Jul 2020 17:41:14 +0900 Subject: [PATCH] Replace mecab-python3 with fugashi for Japanese tokenization (#6086) * Replace mecab-python3 with fugashi This replaces mecab-python3 with fugashi for Japanese tokenization. I am the maintainer of both projects. Both projects are MeCab wrappers, so the underlying C++ code is the same. fugashi is the newer wrapper and doesn't use SWIG, so for basic use of the MeCab API it's easier to use. This code ensures the use of a version of ipadic installed via pip, which should make versioning and tracking down issues easier. fugashi has wheels for Windows, OSX, and Linux, which will help with issues with installing old versions of mecab-python3 on Windows. Compared to mecab-python3, because fugashi doesn't use SWIG, it doesn't require a C++ runtime to be installed on Windows. In adding this change I removed some code dealing with `cursor`, `token_start`, and `token_end` variables. These variables didn't seem to be used for anything; it is unclear to me why they were there. I ran the tests and they passed, though I couldn't figure out how to run the slow tests (`--runslow` gave an error) and didn't try testing with TensorFlow. * Style fix * Remove unused variable Forgot to delete this... 
* Adapt doc with install instructions * Fix typo Co-authored-by: sgugger Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .circleci/config.yml | 2 +- docs/source/pretrained_models.rst | 10 +++++---- setup.cfg | 2 +- setup.py | 4 ++-- .../tokenization_bert_japanese.py | 21 ++++++++----------- 5 files changed, 19 insertions(+), 20 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4d76bb6ae2..100109539b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -56,7 +56,7 @@ jobs: RUN_CUSTOM_TOKENIZERS: yes steps: - checkout - - run: sudo pip install .[mecab,testing] + - run: sudo pip install .[ja,testing] - run: python -m pytest -s ./tests/test_tokenization_bert_japanese.py | tee output.txt - store_artifacts: path: ~/transformers/output.txt diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 27f048dc25..0ef6c976f1 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -74,14 +74,16 @@ For a list that includes community-uploaded models, refer to `https://huggingfac | | | (see `details on dbmdz repository `__). | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``cl-tohoku/bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | -| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. | -| | | | `MeCab `__ is required for tokenization. | +| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece and this requires some extra dependencies, | +| | | | `fugashi `__ which is a wrapper around `MeCab `__. | +| | | | Use ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install from source) to install them. | | | | | | | | (see `details on cl-tohoku repository `__). 
| | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``cl-tohoku/bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | -| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. | -| | | | `MeCab `__ is required for tokenization. | +| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece and this requires some extra dependencies, | +| | | | `fugashi `__ which is a wrapper around `MeCab `__. | +| | | | Use ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install from source) to install them. | | | | | | | | (see `details on cl-tohoku repository `__). | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/setup.cfg b/setup.cfg index d630f0b3a9..e5467ab623 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,10 +10,10 @@ known_third_party = faiss fastprogress fire + fugashi git h5py matplotlib - MeCab nlp nltk numpy diff --git a/setup.py b/setup.py index c4fc91ab31..206c3e3540 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ if stale_egg_info.exists(): extras = {} -extras["mecab"] = ["mecab-python3<1"] +extras["ja"] = ["fugashi>=1.0", "ipadic>=1.0,<2.0"] extras["sklearn"] = ["scikit-learn"] # keras2onnx and onnxconverter-common version is specific through a commit until 1.7.0 lands on pypi @@ -97,7 +97,7 @@ extras["quality"] = [ "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort", "flake8", ] -extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3<1", "scikit-learn", "tensorflow", "torch"] +extras["dev"] = extras["testing"] + extras["quality"] + extras["ja"] + 
["scikit-learn", "tensorflow", "torch"] setup( name="transformers", diff --git a/src/transformers/tokenization_bert_japanese.py b/src/transformers/tokenization_bert_japanese.py index 43ae8e1894..c3ede2c47e 100644 --- a/src/transformers/tokenization_bert_japanese.py +++ b/src/transformers/tokenization_bert_japanese.py @@ -185,9 +185,14 @@ class MecabTokenizer: self.never_split = never_split if never_split is not None else [] self.normalize_text = normalize_text - import MeCab + import fugashi + import ipadic - self.mecab = MeCab.Tagger(mecab_option) if mecab_option is not None else MeCab.Tagger() + # Use ipadic by default (later options can override it) + mecab_option = mecab_option or "" + mecab_option = ipadic.MECAB_ARGS + " " + mecab_option + + self.mecab = fugashi.GenericTagger(mecab_option) def tokenize(self, text, never_split=None, **kwargs): """Tokenizes a piece of text.""" @@ -197,21 +202,13 @@ class MecabTokenizer: never_split = self.never_split + (never_split if never_split is not None else []) tokens = [] - mecab_output = self.mecab.parse(text) + for word in self.mecab(text): + token = word.surface - cursor = 0 - for line in mecab_output.split("\n"): - if line == "EOS": - break - - token, _ = line.split("\t") - token_start = text.index(token, cursor) - token_end = token_start + len(token) if self.do_lower_case and token not in never_split: token = token.lower() tokens.append(token) - cursor = token_end return tokens