From 4374eaea786399a948b4b34d39fc614ded5e1de6 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 6 Nov 2019 20:47:42 +0000 Subject: [PATCH] ALBERT for SQuAD --- examples/run_squad.py | 12 +++-- transformers/__init__.py | 1 + transformers/modeling_albert.py | 93 ++++++++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 6 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 69088d73c3..59683c0668 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -43,7 +43,8 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLMTokenizer, XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer, - DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) + DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer, + AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer) from transformers import AdamW, get_linear_schedule_with_warmup @@ -65,7 +66,8 @@ MODEL_CLASSES = { 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) + 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), + 'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer) } def set_seed(args): @@ -128,7 +130,7 @@ def train(args, train_dataset, model, tokenizer): logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) - global_step = 0 + global_step = 1 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) @@ -537,7 +539,7 @@ def main(): torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) # Load a trained model and vocabulary that you have fine-tuned - model = model_class.from_pretrained(args.output_dir) + model = model_class.from_pretrained(args.output_dir, force_download=True) tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) @@ -555,7 +557,7 @@ def main(): for checkpoint in checkpoints: # Reload the model global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - model = model_class.from_pretrained(checkpoint) + model = model_class.from_pretrained(checkpoint, force_download=True) model.to(args.device) # Evaluate diff --git a/transformers/__init__.py b/transformers/__init__.py index 51995942ce..81e659329d 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -108,6 +108,7 @@ if is_torch_available(): from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model from .modeling_albert import (AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, + AlbertForQuestionAnswering, load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) # Optimization diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py index 51cb0a6d23..2540218e69 100644 --- a/transformers/modeling_albert.py +++ b/transformers/modeling_albert.py @@ -586,4 +586,95 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs - return outputs # (loss), logits, (hidden_states), (attentions) \ No newline at end of file + return outputs # (loss), logits, (hidden_states), (attentions) + + + +@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) +class AlbertForQuestionAnswering(AlbertPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') + model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2') + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]" + input_ids = tokenizer.encode(input_text) + token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] + start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) + # a nice puppet + + + """ + def __init__(self, config): + super(AlbertForQuestionAnswering, self).__init__(config) + self.num_labels = config.num_labels + + self.albert = AlbertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + start_positions=None, end_positions=None): + + outputs = self.albert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)