diff --git a/docs/source/index.rst b/docs/source/index.rst index a2ad13949d..6a100ed05c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -442,6 +442,7 @@ TensorFlow and/or Flax. model_doc/bert model_doc/bertweet model_doc/bertgeneration + model_doc/bert_japanese model_doc/bigbird model_doc/blenderbot model_doc/blenderbot_small diff --git a/docs/source/model_doc/bert_japanese.rst b/docs/source/model_doc/bert_japanese.rst new file mode 100644 index 0000000000..b078d4cba7 --- /dev/null +++ b/docs/source/model_doc/bert_japanese.rst @@ -0,0 +1,78 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +BertJapanese +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The BERT models trained on Japanese text. + +There are models with two different tokenization methods: + +- Tokenize with MeCab and WordPiece. This requires some extra dependencies, `fugashi + <https://github.com/polm/fugashi>`__ which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__. +- Tokenize into characters. + +To use `MecabTokenizer`, you should ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install +from source) to install dependencies. + +See `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__. + +Example of using a model with MeCab and WordPiece tokenization: + +.. 
code-block:: + + >>> import torch + >>> from transformers import AutoModel, AutoTokenizer + + >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese") + >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese") + + >>> ## Input Japanese Text + >>> line = "吾輩は猫である。" + + >>> inputs = tokenizer(line, return_tensors="pt") + + >>> print(tokenizer.decode(inputs['input_ids'][0])) + [CLS] 吾輩 は 猫 で ある 。 [SEP] + + >>> outputs = bertjapanese(**inputs) + +Example of using a model with Character tokenization: + +.. code-block:: + + >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char") + >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char") + + >>> ## Input Japanese Text + >>> line = "吾輩は猫である。" + + >>> inputs = tokenizer(line, return_tensors="pt") + + >>> print(tokenizer.decode(inputs['input_ids'][0])) + [CLS] 吾 輩 は 猫 で あ る 。 [SEP] + + >>> outputs = bertjapanese(**inputs) + +Tips: + +- This implementation is the same as BERT, except for the tokenization method. Refer to the :doc:`documentation of BERT + <bert>` for more usage examples. + +BertJapaneseTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BertJapaneseTokenizer + :members: diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 995c944c35..242a75e702 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -70,7 +70,25 @@ PRETRAINED_INIT_CONFIGURATION = { class BertJapaneseTokenizer(BertTokenizer): - """BERT tokenizer for Japanese text""" + r""" + Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer. + + Args: + vocab_file (:obj:`str`): + Path to a one-wordpiece-per-line vocabulary file. 
+ do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lower case the input. Only has an effect when do_basic_tokenize=True. + do_word_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to do word tokenization. + do_subword_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to do subword tokenization. + word_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"basic"`): + Type of word tokenizer. + subword_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"wordpiece"`): + Type of subword tokenizer. + mecab_kwargs (:obj:`dict`, `optional`): + Dictionary passed to the :obj:`MecabTokenizer` constructor. + """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP @@ -94,23 +112,6 @@ class BertJapaneseTokenizer(BertTokenizer): mecab_kwargs=None, **kwargs ): - """ - Constructs a MecabBertTokenizer. - - Args: - **vocab_file**: Path to a one-wordpiece-per-line vocabulary file. - **do_lower_case**: (`optional`) boolean (default True) - Whether to lower case the input. Only has an effect when do_basic_tokenize=True. - **do_word_tokenize**: (`optional`) boolean (default True) - Whether to do word tokenization. - **do_subword_tokenize**: (`optional`) boolean (default True) - Whether to do subword tokenization. - **word_tokenizer_type**: (`optional`) string (default "basic") - Type of word tokenizer. - **subword_tokenizer_type**: (`optional`) string (default "wordpiece") - Type of subword tokenizer. - **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None) - """ super(BertTokenizer, self).__init__( unk_token=unk_token, sep_token=sep_token, @@ -230,7 +231,7 @@ class MecabTokenizer: import fugashi except ModuleNotFoundError as error: raise error.__class__( - "You need to install fugashi to use MecabTokenizer." + "You need to install fugashi to use MecabTokenizer. " "See https://pypi.org/project/fugashi/ for installation." )