From 857e0a0d3ba39be6259961524a730d3f106cec9c Mon Sep 17 00:00:00 2001 From: Sam Shleifer Date: Thu, 5 Mar 2020 17:41:18 -0500 Subject: [PATCH] Rename BartForMaskedLM -> BartForConditionalGeneration (#3114) * improved documentation --- docs/source/model_doc/bart.rst | 31 +++++------ examples/summarization/bart/evaluate_cnn.py | 4 +- src/transformers/__init__.py | 6 ++- ..._original_pytorch_checkpoint_to_pytorch.py | 16 ++++-- src/transformers/modeling_auto.py | 11 ++-- src/transformers/modeling_bart.py | 54 ++++++++----------- tests/test_modeling_bart.py | 24 +++++---- 7 files changed, 75 insertions(+), 71 deletions(-) diff --git a/docs/source/model_doc/bart.rst b/docs/source/model_doc/bart.rst index c618b97df6..a988422b7b 100644 --- a/docs/source/model_doc/bart.rst +++ b/docs/source/model_doc/bart.rst @@ -7,7 +7,7 @@ file a `Github Issue `_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019. -According to the abstract: +According to the abstract, - Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a left-to-right decoder (like GPT). - The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme, where spans of text are replaced with a single mask token. @@ -18,26 +18,28 @@ The Authors' code can be found `here Tuple: return tuple(x for x in tup if isinstance(x, torch.Tensor) or x) -RET_DOCSTRING = r""" - Return: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. -""" # Public API @@ -863,10 +848,9 @@ class BartModel(PretrainedBartModel): @add_start_docstrings( - "The bare BART Model with a language modeling head. This is the model used for summarization.", - BART_START_DOCSTRING, + "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING, ) -class BartForMaskedLM(PretrainedBartModel): +class BartForConditionalGeneration(PretrainedBartModel): base_model_prefix = "model" def __init__(self, config: BartConfig): @@ -919,11 +903,18 @@ class BartForMaskedLM(PretrainedBartModel): Examples:: - tokenizer = BartTokenizer.from_pretrained('bart-large') - model = BartForMaskedLM.from_pretrained('bart-large') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - outputs = model(input_ids=input_ids, lm_labels=input_ids) - loss, prediction_scores = outputs[:2] + # Mask filling only works for bart-large + from transformers import BartTokenizer, BartForConditionalGeneration + tokenizer = AutoTokenizer.from_pretrained('bart-large') + TXT = "My friends are but they eat too many carbs." + model = BartForConditionalGeneration.from_pretrained('bart-large') + input_ids = tokenizer.batch_encode_plus([TXT], return_tensors='pt')['input_ids'] + logits = model(input_ids)[0] + masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + probs = logits[0, masked_index].softmax(dim=0) + values, predictions = probs.topk(5) + tokenizer.decode(predictions).split() + # ['good', 'great', 'all', 'really', 'very'] """ outputs = self.model( input_ids, @@ -992,8 +983,7 @@ class BartForMaskedLM(PretrainedBartModel): min_len=0, no_repeat_ngram_size=0, ): - r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling - and beam-search. + r""" Generates summaries using the lm-head and greedy beam search Adapted in part from Facebook's `XLM beam search code`_ and `Fairseq beam search code`_. @@ -1031,16 +1021,16 @@ class BartForMaskedLM(PretrainedBartModel): sequence_length is <= max_length (examples can finish early) Examples:: - - config = BartConfig(vocab_size=50264, output_past=True) - model = AutoModelWithLMHead.from_pretrained('bart-large-cnn', config=config) - tokenizer = AutoTokenizer.from_pretrained('bart-large-cnn') + from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig + # see ``examples/summarization/bart/evaluate_cnn.py`` for a longer example + config = BartConfig(vocab_size=50264, output_past=True) # no mask_token_id + model = BartForConditionalGeneration.from_pretrained('bart-large-cnn', config=config) + tokenizer = BartTokenizer.from_pretrained('bart-large-cnn') ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." inputs = tokenizer.batch_encode_plus([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') # Generate Summary - generated_ids = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], num_beams=4, max_length=5) - print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated_ids]) - + summary_ids = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], num_beams=4, max_length=5) + print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) """ bos_token_id = self.config.bos_token_id pad_token_id = self.config.pad_token_id diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index f588d445b2..565335e69b 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -29,7 +29,7 @@ if is_torch_available(): from transformers import ( AutoModelForSequenceClassification, BartModel, - BartForMaskedLM, + BartForConditionalGeneration, BartForSequenceClassification, BartConfig, ) @@ -97,7 +97,9 @@ def prepare_bart_inputs_dict( @require_torch class BARTModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BartModel, BartForMaskedLM, BartForSequenceClassification) if is_torch_available() else () + all_model_classes = ( + (BartModel, BartForConditionalGeneration, BartForSequenceClassification) if is_torch_available() else () + ) is_encoder_decoder = True # TODO(SS): fix the below in a separate PR test_pruning = False @@ -221,8 +223,8 @@ class BartHeadTests(unittest.TestCase): def test_lm_forward(self): config, input_ids, batch_size = self._get_config_and_data(output_past=False) - decoder_lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size) - lm_model = BartForMaskedLM(config) + decoder_lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size).to(torch_device) + lm_model = BartForConditionalGeneration(config) lm_model.to(torch_device) loss, logits, enc_features = lm_model.forward( input_ids=input_ids, lm_labels=decoder_lm_labels, decoder_input_ids=input_ids @@ -243,15 +245,15 @@ class BartHeadTests(unittest.TestCase): decoder_ffn_dim=32, max_position_embeddings=48, ) - lm_model = BartForMaskedLM(config) - context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long() - summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long() - logits, enc_features = lm_model.forward(input_ids=context, decoder_input_ids=summary) + lm_model = BartForConditionalGeneration(config).to(torch_device) + context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device) + summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long().to(torch_device) + loss, logits, enc_features = lm_model.forward(input_ids=context, decoder_input_ids=summary, lm_labels=summary) expected_shape = (*summary.shape, config.vocab_size) self.assertEqual(logits.shape, expected_shape) def test_generate_beam_search(self): - input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long() + input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long().to(torch_device) config = BartConfig( vocab_size=self.vocab_size, d_model=24, @@ -264,7 +266,7 @@ class BartHeadTests(unittest.TestCase): max_position_embeddings=48, output_past=True, ) - lm_model = BartForMaskedLM(config) + lm_model = BartForConditionalGeneration(config).to(torch_device) lm_model.eval() new_input_ids = lm_model.generate( @@ -376,7 +378,7 @@ class BartModelIntegrationTest(unittest.TestCase): @slow def test_cnn_summarization_same_as_fairseq(self): - hf = BartForMaskedLM.from_pretrained("bart-large-cnn", output_past=True,).to(torch_device) + hf = BartForConditionalGeneration.from_pretrained("bart-large-cnn", output_past=True,).to(torch_device) tok = BartTokenizer.from_pretrained("bart-large") text = " (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian" tokens = tok.encode(text, return_tensors="pt").to(torch_device)