From 4f3a54bfc8fa8749f6d5b29f110148738a646fcd Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 31 Oct 2019 16:37:34 +0000 Subject: [PATCH] ALBERT can load pre-trained models. Doesn't inherit from BERT anymore. --- transformers/__init__.py | 2 +- transformers/configuration_albert.py | 9 ++++++ transformers/modeling_albert.py | 44 +++++++++++++++++++++++----- transformers/tokenization_albert.py | 25 +++++++++++++--- 4 files changed, 68 insertions(+), 12 deletions(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index 152d520e7b..bdfb1a0922 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -107,7 +107,7 @@ if is_torch_available(): CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model - from .modeling_albert import (AlbertModel, AlbertForMaskedLM) + from .modeling_albert import (AlbertModel, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) # Optimization from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, diff --git a/transformers/configuration_albert.py b/transformers/configuration_albert.py index b72bbb971e..c35426768f 100644 --- a/transformers/configuration_albert.py +++ b/transformers/configuration_albert.py @@ -17,12 +17,21 @@ from .configuration_utils import PretrainedConfig +ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'albert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json", + 'albert-large': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json", + 'albert-xlarge': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json", + 'albert-xxlarge': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json", +} + class AlbertConfig(PretrainedConfig): """Configuration for `AlbertModel`. The default settings match the configuration of model `albert_xxlarge`. 
""" + pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP + def __init__(self, vocab_size_or_config_json_file=30000, embedding_size=128, diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py index 487455e561..4da10ed1cb 100644 --- a/transformers/modeling_albert.py +++ b/transformers/modeling_albert.py @@ -21,6 +21,7 @@ import logging import torch import torch.nn as nn from torch.nn import CrossEntropyLoss +from transformers.modeling_utils import PreTrainedModel from transformers.configuration_albert import AlbertConfig from transformers.modeling_bert import BertEmbeddings, BertPreTrainedModel, BertModel, BertSelfAttention, prune_linear_layer, ACT2FN from .file_utils import add_start_docstrings @@ -274,6 +275,29 @@ class AlbertTransformer(nn.Module): return outputs # last-layer hidden state, (all hidden states), (all attentions) + +class AlbertPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + config_class = AlbertConfig + pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "albert" + + def _init_weights(self, module): + """ Initialize the weights. + """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, (nn.Linear)) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + ALBERT_START_DOCSTRING = r""" The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. 
It presents @@ -338,7 +362,7 @@ ALBERT_INPUTS_DOCSTRING = r""" @add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) -class AlbertModel(BertModel): +class AlbertModel(AlbertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` @@ -358,6 +382,12 @@ class AlbertModel(BertModel): list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ + + config_class = AlbertConfig + pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_albert + base_model_prefix = "albert" + def __init__(self, config): super(AlbertModel, self).__init__(config) @@ -369,6 +399,11 @@ class AlbertModel(BertModel): self.init_weights() + def _resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.embeddings.word_embeddings + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.embeddings.word_embeddings = new_embeddings + return self.embeddings.word_embeddings def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): if attention_mask is None: @@ -423,7 +458,7 @@ class AlbertMLMHead(nn.Module): @add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) -class AlbertForMaskedLM(BertPreTrainedModel): +class AlbertForMaskedLM(AlbertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. 
@@ -445,11 +480,6 @@ class AlbertForMaskedLM(BertPreTrainedModel): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - config_class = AlbertConfig - pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP - load_tf_weights = load_tf_weights_in_albert - base_model_prefix = "albert" - def __init__(self, config): super(AlbertForMaskedLM, self).__init__(config) diff --git a/transformers/tokenization_albert.py b/transformers/tokenization_albert.py index 7cba99b9e4..acf67c1154 100644 --- a/transformers/tokenization_albert.py +++ b/transformers/tokenization_albert.py @@ -15,7 +15,7 @@ """ Tokenization classes for ALBERT model.""" from __future__ import (absolute_import, division, print_function, unicode_literals) - + from .tokenization_utils import PreTrainedTokenizer import logging import unicodedata @@ -24,8 +24,25 @@ import os from shutil import copyfile logger = logging.getLogger(__name__) - VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'albert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model", + 'albert-large': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model", + 'albert-xlarge': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model", + 'albert-xxlarge': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'albert-base': 512, + 'albert-large': 512, + 'albert-xlarge': 512, + 'albert-xxlarge': 512, +} + SPIECE_UNDERLINE = u'▁' class AlbertTokenizer(PreTrainedTokenizer): @@ -35,8 +52,8 @@ class AlbertTokenizer(PreTrainedTokenizer): - requires `SentencePiece `_ """ vocab_files_names = VOCAB_FILES_NAMES - # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_vocab_files_map = 
PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, vocab_file, do_lower_case=True, remove_space=True, keep_accents=False,