Add documentation for BertJapanese (#11219)

* Start writing BERT-Japanese doc * Fix typo, Update toctree * Modify model file to use comment for document, Add examples * Clean bert_japanese by make style * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Split a big code block into two * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Add prefix >>> to all lines in code blocks * Clean bert_japanese by make fixup Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-04-13 22:49:15 +09:00
parent 896d7be974
commit 22fa0a6004
3 changed files with 99 additions and 19 deletions
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -442,6 +442,7 @@ TensorFlow and/or Flax.
    model_doc/bert
    model_doc/bertweet
    model_doc/bertgeneration
    model_doc/bert_japanese
    model_doc/bigbird
    model_doc/blenderbot
    model_doc/blenderbot_small
--- a/docs/source/model_doc/bert_japanese.rst
+++ b/docs/source/model_doc/bert_japanese.rst
@@ -0,0 +1,78 @@
 .. 
    Copyright 2020 The HuggingFace Team. All rights reserved.
    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at
        http://www.apache.org/licenses/LICENSE-2.0
    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.
 BertJapanese
 -----------------------------------------------------------------------------------------------------------------------
 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The BERT models trained on Japanese text.
 There are models with two different tokenization methods:
 - Tokenize with MeCab and WordPiece. This requires an extra dependency, `fugashi
  <https://github.com/polm/fugashi>`__, which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__.
 - Tokenize into characters.
 To use ``MecabTokenizer``, you should ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install
 from source) to install dependencies.
 See `details on the cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__.
 Example of using a model with MeCab and WordPiece tokenization:
 .. code-block::
  >>> import torch
  >>> from transformers import AutoModel, AutoTokenizer 
  >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
  >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
  >>> ## Input Japanese Text
  >>> line = "吾輩は猫である。"
  >>> inputs = tokenizer(line, return_tensors="pt")
  >>> print(tokenizer.decode(inputs['input_ids'][0]))
  [CLS] 吾輩 は 猫 で ある 。 [SEP]
  >>> outputs = bertjapanese(**inputs)
 Example of using a model with Character tokenization:
 .. code-block::
  >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
  >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
  >>> ## Input Japanese Text
  >>> line = "吾輩は猫である。"
  >>> inputs = tokenizer(line, return_tensors="pt")
  >>> print(tokenizer.decode(inputs['input_ids'][0]))
  [CLS] 吾 輩 は 猫 で あ る 。 [SEP]
  >>> outputs = bertjapanese(**inputs)
 Tips:
 - This implementation is the same as BERT, except for the tokenization method. Refer to the :doc:`documentation of BERT
  <bert>` for more usage examples.
 BertJapaneseTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.BertJapaneseTokenizer
    :members: 
--- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
+++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
@@ -70,7 +70,25 @@ PRETRAINED_INIT_CONFIGURATION = {
 class BertJapaneseTokenizer(BertTokenizer):
-    """BERT tokenizer for Japanese text"""
+    r"""
    Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer.
    Args:
        vocab_file (:obj:`str`):
            Path to a one-wordpiece-per-line vocabulary file.
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to lower case the input. Only has an effect when :obj:`do_word_tokenize=True`.
        do_word_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to do word tokenization.
        do_subword_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to do subword tokenization.
        word_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"basic"`):
            Type of word tokenizer.
        subword_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"wordpiece"`):
            Type of subword tokenizer.
        mecab_kwargs (:obj:`dict`, `optional`):
            Dictionary passed to the :obj:`MecabTokenizer` constructor.
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
@@ -94,23 +112,6 @@ class BertJapaneseTokenizer(BertTokenizer):
        mecab_kwargs=None,
        **kwargs
    ):
        """
        Constructs a MecabBertTokenizer.
        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default True)
                Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
            **do_word_tokenize**: (`optional`) boolean (default True)
                Whether to do word tokenization.
            **do_subword_tokenize**: (`optional`) boolean (default True)
                Whether to do subword tokenization.
            **word_tokenizer_type**: (`optional`) string (default "basic")
                Type of word tokenizer.
            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                Type of subword tokenizer.
            **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None)
        """
        super(BertTokenizer, self).__init__(
            unk_token=unk_token,
            sep_token=sep_token,
@@ -230,7 +231,7 @@ class MecabTokenizer:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
-                "You need to install fugashi to use MecabTokenizer."
+                "You need to install fugashi to use MecabTokenizer. "
                "See https://pypi.org/project/fugashi/ for installation."
            )