From 7e73601f3240b99d952c34b63bf4f8b78ca1462d Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Tue, 1 Jun 2021 20:28:41 +0800 Subject: [PATCH] modify qa-trainer (#11872) * modify qa-trainer * fix flax model --- .../pytorch/question-answering/run_qa_no_trainer.py | 10 +++++++++- src/transformers/models/albert/modeling_albert.py | 4 ++-- src/transformers/models/bart/modeling_bart.py | 4 ++-- src/transformers/models/bert/modeling_bert.py | 4 ++-- src/transformers/models/big_bird/modeling_big_bird.py | 4 ++-- .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 4 ++-- src/transformers/models/convbert/modeling_convbert.py | 4 ++-- src/transformers/models/deberta/modeling_deberta.py | 4 ++-- .../models/deberta_v2/modeling_deberta_v2.py | 4 ++-- .../models/distilbert/modeling_distilbert.py | 4 ++-- src/transformers/models/dpr/modeling_dpr.py | 4 ++-- src/transformers/models/electra/modeling_electra.py | 4 ++-- src/transformers/models/funnel/modeling_funnel.py | 4 ++-- src/transformers/models/ibert/modeling_ibert.py | 4 ++-- src/transformers/models/led/modeling_led.py | 4 ++-- .../models/longformer/modeling_longformer.py | 4 ++-- src/transformers/models/mbart/modeling_mbart.py | 4 ++-- .../models/megatron_bert/modeling_megatron_bert.py | 4 ++-- .../models/mobilebert/modeling_mobilebert.py | 4 ++-- src/transformers/models/mpnet/modeling_mpnet.py | 4 ++-- src/transformers/models/reformer/modeling_reformer.py | 4 ++-- src/transformers/models/roberta/modeling_roberta.py | 4 ++-- .../models/squeezebert/modeling_squeezebert.py | 4 ++-- src/transformers/models/xlm/modeling_xlm.py | 4 ++-- src/transformers/models/xlnet/modeling_xlnet.py | 4 ++-- 25 files changed, 57 insertions(+), 49 deletions(-) diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index d0bb745785..e61a3a5227 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -692,7 +692,11 @@ def main(): if completed_steps >= args.max_train_steps: break - # Validation + # Evaluation + logger.info("***** Running Evaluation *****") + logger.info(f" Num examples = {len(eval_dataset)}") + logger.info(f" Batch size = {args.per_device_eval_batch_size}") + all_start_logits = [] all_end_logits = [] for step, batch in enumerate(eval_dataloader): @@ -725,6 +729,10 @@ def main(): # Prediction if args.do_predict: + logger.info("***** Running Prediction *****") + logger.info(f" Num examples = {len(predict_dataset)}") + logger.info(f" Batch size = {args.per_device_eval_batch_size}") + all_start_logits = [] all_end_logits = [] for step, batch in enumerate(predict_dataloader): diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index ca41ec2a22..9ba21cb990 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -1218,8 +1218,8 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 1c66f06a00..c9309f7023 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -1556,8 +1556,8 @@ class BartForQuestionAnswering(BartPretrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 75aadf2d90..c12207fcc7 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1801,8 +1801,8 @@ class BertForQuestionAnswering(BertPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 3d5e443e1c..3029884573 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -2983,8 +2983,8 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel): logits = logits - logits_mask * 1e6 start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 0c3860f85f..c6a41247c8 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -2761,8 +2761,8 @@ class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index b6ac5abc02..ea79fb9653 100755 --- a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -1293,8 +1293,8 @@ class ConvBertForQuestionAnswering(ConvBertPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 84989fda75..08a77183be 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -1364,8 +1364,8 @@ class DebertaForQuestionAnswering(DebertaPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 03563b02b9..f814f219ca 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -1488,8 +1488,8 @@ class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 3dc968cdf0..96fe25bafb 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -728,8 +728,8 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) # (bs, max_query_len) - end_logits = end_logits.squeeze(-1) # (bs, max_query_len) + start_logits = start_logits.squeeze(-1).contiguous() # (bs, max_query_len) + end_logits = end_logits.squeeze(-1).contiguous() # (bs, max_query_len) total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index cb98c8fa81..35768d3c75 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -241,8 +241,8 @@ class DPRSpanPredictor(PreTrainedModel): # compute logits logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() relevance_logits = self.qa_classifier(sequence_output[:, 0, :]) # resize diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 4d8479942e..063f8df707 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -1318,8 +1318,8 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index 428ce54fff..46f14e88f9 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -1549,8 +1549,8 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel): logits = self.qa_outputs(last_hidden_state) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index 3c72c2a17e..e30d24d5a3 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -1319,8 +1319,8 @@ class IBertForQuestionAnswering(IBertPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 2541121a21..34d60dbb7e 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -2585,8 +2585,8 @@ class LEDForQuestionAnswering(LEDPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 4aa6f55687..6564a39065 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -2017,8 +2017,8 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 8e9b24499a..9b78ab897d 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1563,8 +1563,8 @@ class MBartForQuestionAnswering(MBartPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 49969c06b8..c40765bbf2 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1794,8 +1794,8 @@ class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index a37f3e2833..e727d54912 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -1371,8 +1371,8 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index 90ba92242b..5bdc97b975 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -1023,8 +1023,8 @@ class MPNetForQuestionAnswering(MPNetPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index c19ac5265a..3a4cbddc18 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -2555,8 +2555,8 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index c3503c292a..2c7348a119 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -1472,8 +1472,8 @@ class RobertaForQuestionAnswering(RobertaPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index 7fb76f0328..8bc786cd27 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -1068,8 +1068,8 @@ class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index bcf08ae410..3a47bcfe7d 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -941,8 +941,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index 6f0eaa3f8c..5185b800cd 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -1862,8 +1862,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() total_loss = None if start_positions is not None and end_positions is not None: