Add documentation for BertJapanese (#11219)
* Start writing BERT-Japanese doc * Fix typo, Update toctree * Modify model file to use comment for document, Add examples * Clean bert_japanese by make style * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Split a big code block into two * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Add prefix >>> to all lines in code blocks * Clean bert_japanese by make fixup Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -442,6 +442,7 @@ TensorFlow and/or Flax.
|
|||||||
model_doc/bert
|
model_doc/bert
|
||||||
model_doc/bertweet
|
model_doc/bertweet
|
||||||
model_doc/bertgeneration
|
model_doc/bertgeneration
|
||||||
|
model_doc/bert_japanese
|
||||||
model_doc/bigbird
|
model_doc/bigbird
|
||||||
model_doc/blenderbot
|
model_doc/blenderbot
|
||||||
model_doc/blenderbot_small
|
model_doc/blenderbot_small
|
||||||
|
|||||||
78
docs/source/model_doc/bert_japanese.rst
Normal file
78
docs/source/model_doc/bert_japanese.rst
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
..
|
||||||
|
Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||||
|
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||||
|
specific language governing permissions and limitations under the License.
|
||||||
|
|
||||||
|
BertJapanese
|
||||||
|
-----------------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
Overview
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The BertJapanese models are BERT models trained on Japanese text.
|
||||||
|
|
||||||
|
There are models with two different tokenization methods:
|
||||||
|
|
||||||
|
- Tokenize with MeCab and WordPiece. This requires some extra dependencies, `fugashi
|
||||||
|
<https://github.com/polm/fugashi>`__ which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__.
|
||||||
|
- Tokenize into characters.
|
||||||
|
|
||||||
|
To use :obj:`MecabTokenizer`, you should ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install
|
||||||
|
from source) to install dependencies.
|
||||||
|
|
||||||
|
See `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__.
|
||||||
|
|
||||||
|
Example of using a model with MeCab and WordPiece tokenization:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
>>> import torch
|
||||||
|
>>> from transformers import AutoModel, AutoTokenizer
|
||||||
|
|
||||||
|
>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
|
||||||
|
>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
|
||||||
|
|
||||||
|
>>> ## Input Japanese Text
|
||||||
|
>>> line = "吾輩は猫である。"
|
||||||
|
|
||||||
|
>>> inputs = tokenizer(line, return_tensors="pt")
|
||||||
|
|
||||||
|
>>> print(tokenizer.decode(inputs['input_ids'][0]))
|
||||||
|
[CLS] 吾輩 は 猫 で ある 。 [SEP]
|
||||||
|
|
||||||
|
>>> outputs = bertjapanese(**inputs)
|
||||||
|
|
||||||
|
Example of using a model with Character tokenization:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
|
||||||
|
>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
|
||||||
|
|
||||||
|
>>> ## Input Japanese Text
|
||||||
|
>>> line = "吾輩は猫である。"
|
||||||
|
|
||||||
|
>>> inputs = tokenizer(line, return_tensors="pt")
|
||||||
|
|
||||||
|
>>> print(tokenizer.decode(inputs['input_ids'][0]))
|
||||||
|
[CLS] 吾 輩 は 猫 で あ る 。 [SEP]
|
||||||
|
|
||||||
|
>>> outputs = bertjapanese(**inputs)
|
||||||
|
|
||||||
|
Tips:
|
||||||
|
|
||||||
|
- This implementation is the same as BERT, except for the tokenization method. Refer to the :doc:`documentation of BERT
|
||||||
|
<bert>` for more usage examples.
|
||||||
|
|
||||||
|
BertJapaneseTokenizer
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.BertJapaneseTokenizer
|
||||||
|
:members:
|
||||||
@@ -70,7 +70,25 @@ PRETRAINED_INIT_CONFIGURATION = {
|
|||||||
|
|
||||||
|
|
||||||
class BertJapaneseTokenizer(BertTokenizer):
|
class BertJapaneseTokenizer(BertTokenizer):
|
||||||
"""BERT tokenizer for Japanese text"""
|
r"""
|
||||||
|
Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`str`):
|
||||||
|
Path to a one-wordpiece-per-line vocabulary file.
|
||||||
|
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to lower case the input. Only has an effect when :obj:`do_word_tokenize=True`.
|
||||||
|
do_word_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to do word tokenization.
|
||||||
|
do_subword_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to do subword tokenization.
|
||||||
|
word_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"basic"`):
|
||||||
|
Type of word tokenizer.
|
||||||
|
subword_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"wordpiece"`):
|
||||||
|
Type of subword tokenizer.
|
||||||
|
mecab_kwargs (:obj:`dict`, `optional`):
|
||||||
|
Dictionary passed to the :obj:`MecabTokenizer` constructor.
|
||||||
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
@@ -94,23 +112,6 @@ class BertJapaneseTokenizer(BertTokenizer):
|
|||||||
mecab_kwargs=None,
|
mecab_kwargs=None,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""
|
|
||||||
Constructs a MecabBertTokenizer.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
**vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
|
|
||||||
**do_lower_case**: (`optional`) boolean (default True)
|
|
||||||
Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
|
|
||||||
**do_word_tokenize**: (`optional`) boolean (default True)
|
|
||||||
Whether to do word tokenization.
|
|
||||||
**do_subword_tokenize**: (`optional`) boolean (default True)
|
|
||||||
Whether to do subword tokenization.
|
|
||||||
**word_tokenizer_type**: (`optional`) string (default "basic")
|
|
||||||
Type of word tokenizer.
|
|
||||||
**subword_tokenizer_type**: (`optional`) string (default "wordpiece")
|
|
||||||
Type of subword tokenizer.
|
|
||||||
**mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None)
|
|
||||||
"""
|
|
||||||
super(BertTokenizer, self).__init__(
|
super(BertTokenizer, self).__init__(
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
sep_token=sep_token,
|
sep_token=sep_token,
|
||||||
@@ -230,7 +231,7 @@ class MecabTokenizer:
|
|||||||
import fugashi
|
import fugashi
|
||||||
except ModuleNotFoundError as error:
|
except ModuleNotFoundError as error:
|
||||||
raise error.__class__(
|
raise error.__class__(
|
||||||
"You need to install fugashi to use MecabTokenizer."
|
"You need to install fugashi to use MecabTokenizer. "
|
||||||
"See https://pypi.org/project/fugashi/ for installation."
|
"See https://pypi.org/project/fugashi/ for installation."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user