diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 2fe1f8a314..d3498e057d 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -61,6 +61,24 @@ Here is the full list of the currently provided pretrained models together with | | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | Trained on uncased German text by DBMDZ | | | | (see `details on dbmdz repository `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. | +| | | | `MeCab `__ is required for tokenization. | +| | | (see `details on cl-tohoku repository `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. | +| | | | `MeCab `__ is required for tokenization. | +| | | (see `details on cl-tohoku repository `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-japanese-char`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on Japanese text. Text is tokenized into characters. | +| | | (see `details on cl-tohoku repository `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. | +| | | (see `details on cl-tohoku repository `__). | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | OpenAI GPT English model | diff --git a/transformers/__init__.py b/transformers/__init__.py index f9a28add5f..5d7b0b772c 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -37,6 +37,7 @@ if is_sklearn_available(): from .tokenization_utils import (PreTrainedTokenizer) from .tokenization_auto import AutoTokenizer from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer +from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) from .tokenization_gpt2 import GPT2Tokenizer diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index d63be963eb..16f1f60404 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -42,6 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", + 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-config.json", + 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-config.json", + 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-config.json", + 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-config.json" } diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index d84b0a1a7c..e2e115a015 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -48,6 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", + 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-pytorch_model.bin", + 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-pytorch_model.bin", + 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-pytorch_model.bin", + 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-pytorch_model.bin" } @@ -1233,9 +1237,9 @@ class BertForQuestionAnswering(BertPreTrainedModel): question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]" input_ids = tokenizer.encode(input_text) - token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] + token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) - all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) # a nice puppet diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index 5aa7bb3da2..27dd311a4d 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -48,6 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5", 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5", 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5", + 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-tf_model.h5", + 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-tf_model.h5", + 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-tf_model.h5", + 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-tf_model.h5" } @@ -129,7 +133,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. - + Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ @@ -148,7 +152,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): input_shape = shape_list(input_ids) else: input_shape = shape_list(inputs_embeds)[:-1] - + seq_length = input_shape[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] @@ -246,7 +250,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer): context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape(context_layer, + context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) @@ -591,7 +595,7 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: - config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -605,13 +609,13 @@ BERT_INPUTS_DOCSTRING = r""" (a) For sequence pairs: ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` (b) For single sequences: ``tokens: [CLS] the dog is hairy . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0`` Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py index b7c5046961..d63b7e783d 100644 --- a/transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging from .tokenization_bert import BertTokenizer +from .tokenization_bert_japanese import BertJapaneseTokenizer from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_ctrl import CTRLTokenizer @@ -118,6 +119,8 @@ class AutoTokenizer(object): return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'bert-japanese' in pretrained_model_name_or_path: + return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'bert' in pretrained_model_name_or_path: return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'openai-gpt' in pretrained_model_name_or_path: diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py new file mode 100644 index 0000000000..8554a1c880 --- /dev/null +++ b/transformers/tokenization_bert_japanese.py @@ -0,0 +1,247 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import logging +import os +import unicodedata +from io import open + +from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab +from .tokenization_utils import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-vocab.txt", + 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-vocab.txt", + 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-vocab.txt", + 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-vocab.txt" + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'bert-base-japanese': 512, + 'bert-base-japanese-whole-word-masking': 512, + 'bert-base-japanese-char': 512, + 'bert-base-japanese-char-whole-word-masking': 512 +} + +PRETRAINED_INIT_CONFIGURATION = { + 'bert-base-japanese': { + 'do_lower_case': False, + 'word_tokenizer_type': 'mecab', + 'subword_tokenizer_type': 'wordpiece' + }, + 'bert-base-japanese-whole-word-masking':{ + 'do_lower_case': False, + 'word_tokenizer_type': 'mecab', + 'subword_tokenizer_type': 'wordpiece' + }, + 'bert-base-japanese-char': { + 'do_lower_case': False, + 'word_tokenizer_type': 'mecab', + 'subword_tokenizer_type': 'character' + }, + 'bert-base-japanese-char-whole-word-masking': { + 'do_lower_case': False, + 'word_tokenizer_type': 'mecab', + 'subword_tokenizer_type': 'character' + } +} + + +class BertJapaneseTokenizer(BertTokenizer): + """BERT tokenizer for Japanese text""" + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, vocab_file, do_lower_case=False, + do_word_tokenize=True, do_subword_tokenize=True, + word_tokenizer_type='basic', subword_tokenizer_type='wordpiece', + never_split=None, unk_token='[UNK]', sep_token='[SEP]', + pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs): + """Constructs a MecabBertTokenizer. + + Args: + **vocab_file**: Path to a one-wordpiece-per-line vocabulary file. + **do_lower_case**: (`optional`) boolean (default True) + Whether to lower case the input. + Only has an effect when do_basic_tokenize=True. + **do_word_tokenize**: (`optional`) boolean (default True) + Whether to do word tokenization. + **do_subword_tokenize**: (`optional`) boolean (default True) + Whether to do subword tokenization. + **word_tokenizer_type**: (`optional`) string (default "basic") + Type of word tokenizer. + **subword_tokenizer_type**: (`optional`) string (default "wordpiece") + Type of subword tokenizer. + """ + super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, + pad_token=pad_token, cls_token=cls_token, + mask_token=mask_token, **kwargs) + self.max_len_single_sentence = self.max_len - 2 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + + self.do_word_tokenize = do_word_tokenize + if do_word_tokenize: + if word_tokenizer_type == 'basic': + self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=False) + elif word_tokenizer_type == 'mecab': + self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, + never_split=never_split) + else: + raise ValueError( + "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type)) + + self.do_subword_tokenize = do_subword_tokenize + if do_subword_tokenize: + if subword_tokenizer_type == 'wordpiece': + self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, + unk_token=self.unk_token) + elif subword_tokenizer_type == 'character': + self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, + unk_token=self.unk_token) + else: + raise ValueError( + "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type)) + + + def _tokenize(self, text): + if self.do_word_tokenize: + tokens = self.word_tokenizer.tokenize(text, + never_split=self.all_special_tokens) + else: + tokens = [text] + + if self.do_subword_tokenize: + split_tokens = [sub_token for token in tokens + for sub_token in self.subword_tokenizer.tokenize(token)] + else: + split_tokens = tokens + + return split_tokens + + +class MecabTokenizer(object): + """Runs basic tokenization with MeCab morphological parser.""" + + def __init__(self, do_lower_case=False, never_split=None, normalize_text=True): + """Constructs a MecabTokenizer. + + Args: + **do_lower_case**: (`optional`) boolean (default True) + Whether to lower case the input. + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + **normalize_text**: (`optional`) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + + import MeCab + self.mecab = MeCab.Tagger() + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize('NFKC', text) + + never_split = self.never_split + (never_split if never_split is not None else []) + tokens = [] + + cursor = 0 + for line in self.mecab.parse(text).split('\n'): + if line == 'EOS': + break + + token, _ = line.split('\t') + token_start = text.index(token, cursor) + token_end = token_start + len(token) + if self.do_lower_case and token not in never_split: + token = token.lower() + + tokens.append(token) + cursor = token_end + + return tokens + + +class CharacterTokenizer(object): + """Runs Character tokenziation.""" + + def __init__(self, vocab, unk_token, normalize_text=True): + """Constructs a CharacterTokenizer. + + Args: + **vocab**: + Vocabulary object. + **unk_token**: str + A special symbol for out-of-vocabulary token. + **normalize_text**: (`optional`) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + """ + self.vocab = vocab + self.unk_token = unk_token + self.normalize_text = normalize_text + + def tokenize(self, text): + """Tokenizes a piece of text into characters. + + For example: + input = "apple" + output = ["a", "p", "p", "l", "e"] + Args: + text: A single token or whitespace separated tokens. + This should have already been passed through `BasicTokenizer`. + Returns: + A list of characters. + """ + if self.normalize_text: + text = unicodedata.normalize('NFKC', text) + + output_tokens = [] + for i, char in enumerate(text): + if char not in self.vocab: + output_tokens.append(self.unk_token) + continue + + output_tokens.append(char) + + return output_tokens