From c4d4e8bdbd25d9463d41de6398940329c89b7fb6 Mon Sep 17 00:00:00 2001 From: Yacine Jernite Date: Tue, 30 Jun 2020 10:42:08 -0400 Subject: [PATCH] Move GenerationMixin to separate file (#5254) * separate_generation_code * isort * renamed * rename_files * move_shapelit --- src/transformers/__init__.py | 5 +- src/transformers/generation_tf_utils.py | 1093 +++++++++++++++++++++++ src/transformers/generation_utils.py | 989 ++++++++++++++++++++ src/transformers/modeling_tf_utils.py | 1061 +--------------------- src/transformers/modeling_utils.py | 966 +------------------- 5 files changed, 2090 insertions(+), 2024 deletions(-) create mode 100644 src/transformers/generation_tf_utils.py create mode 100644 src/transformers/generation_utils.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 453621a2ed..335ac94175 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -169,7 +169,8 @@ if is_sklearn_available(): # Modeling if is_torch_available(): - from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering, apply_chunking_to_forward + from .generation_utils import top_k_top_p_filtering + from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, apply_chunking_to_forward from .modeling_auto import ( AutoModel, AutoModelForPreTraining, @@ -406,9 +407,9 @@ if is_torch_available(): # TensorFlow if is_tf_available(): + from .generation_tf_utils import tf_top_k_top_p_filtering from .modeling_tf_utils import ( shape_list, - tf_top_k_top_p_filtering, TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py new file mode 100644 index 0000000000..675ce26782 --- /dev/null +++ b/src/transformers/generation_tf_utils.py @@ -0,0 +1,1093 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import numpy as np +import tensorflow as tf + + +logger = logging.getLogger(__name__) + + +class TFGenerationMixin: + """ + A class contraining all of the functions supporting generation, to be used as a mixin in TFPreTrainedModel. + """ + + def prepare_inputs_for_generation(self, inputs, **kwargs): + return {"inputs": inputs} + + def _use_cache(self, outputs, use_cache): + """During generation, decide whether to pass the `past` variable to the next forward pass.""" + if len(outputs) <= 1 or use_cache is False: + return False + if hasattr(self.config, "mem_len") and self.config.mem_len == 0: + return False + return True + + def generate( + self, + input_ids=None, + max_length=None, + min_length=None, + do_sample=None, + early_stopping=None, + num_beams=None, + temperature=None, + top_k=None, + top_p=None, + repetition_penalty=None, + bad_words_ids=None, + bos_token_id=None, + pad_token_id=None, + eos_token_id=None, + length_penalty=None, + no_repeat_ngram_size=None, + num_return_sequences=None, + attention_mask=None, + decoder_start_token_id=None, + use_cache=None, + ): + r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling + and beam-search. + + Adapted in part from `Facebook's XLM beam search code`_. + + .. _`Facebook's XLM beam search code`: + https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 + + + Parameters: + + input_ids: (`optional`) `tf.Tensor` of `dtype=tf.int32` of shape `(batch_size, sequence_length)` + The sequence used as a prompt for the generation. If `None` the method initializes + it as an empty `tf.Tensor` of shape `(1,)`. + + max_length: (`optional`) int + The max length of the sequence to be generated. Between 1 and infinity. Default to 20. + + min_length: (`optional`) int + The min length of the sequence to be generated. Between 0 and infinity. Default to 0. + do_sample: (`optional`) bool + If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + early_stopping: (`optional`) bool + if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + num_beams: (`optional`) int + Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. + + temperature: (`optional`) float + The value used to module the next token probabilities. Must be strictely positive. Default to 1.0. + + top_k: (`optional`) int + The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. + + top_p: (`optional`) float + The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. + + repetition_penalty: (`optional`) float + The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. + + bos_token_id: (`optional`) int + Beginning of sentence token if no prompt is provided. Default to specicic model bos_token_id or None if it does not exist. + + pad_token_id: (`optional`) int + Pad token. Defaults to pad_token_id as defined in the models config. + + eos_token_id: (`optional`) int + EOS token. Defaults to eos_token_id as defined in the models config. + + length_penalty: (`optional`) float + Exponential penalty to the length. Default to 1. + + no_repeat_ngram_size: (`optional`) int + If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. + + bad_words_ids: (`optional`) list of lists of int + `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. + + num_return_sequences: (`optional`) int + The number of independently computed returned sequences for each element in the batch. Default to 1. + + attention_mask (`optional`) obj: `tf.Tensor` with `dtype=tf.int32` of same shape as `input_ids` + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + Defaults to `None`. + + `What are attention masks? <../glossary.html#attention-mask>`__ + + decoder_start_token_id=None: (`optional`) int + If an encoder-decoder model starts decoding with a different token than BOS. + Defaults to `None` and is changed to `BOS` later. + + use_cache: (`optional`) bool + If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`. + + Return: + + output: `tf.Tensor` of `dtype=tf.int32` shape `(batch_size * num_return_sequences, sequence_length)` + sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` + + Examples:: + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + outputs = model.generate(max_length=40) # do greedy decoding + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. + input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. + input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl + bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated + """ + + # We cannot generate if the model does not have a LM head + if self.get_output_embeddings() is None: + raise AttributeError( + "You tried to generate sequences with a model that does not have a LM Head." + "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" + ) + + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + do_sample = do_sample if do_sample is not None else self.config.do_sample + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + use_cache = use_cache if use_cache is not None else self.config.use_cache + num_beams = num_beams if num_beams is not None else self.config.num_beams + temperature = temperature if temperature is not None else self.config.temperature + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + + if input_ids is not None: + batch_size = shape_list(input_ids)[0] # overriden by the input batch_size + else: + batch_size = 1 + + assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer." + assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." + assert isinstance(do_sample, bool), "`do_sample` should be a boolean." + assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." + assert isinstance(use_cache, bool), "`use_cache` should be a boolean." + assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer." + assert temperature > 0, "`temperature` should be strictely positive." + assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." + assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." + assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." + assert input_ids is not None or ( + isinstance(bos_token_id, int) and bos_token_id >= 0 + ), "If input_ids is not defined, `bos_token_id` should be a positive integer." + assert pad_token_id is None or ( + isinstance(pad_token_id, int) and (pad_token_id >= 0) + ), "`pad_token_id` should be a positive integer." + assert (eos_token_id is None) or ( + isinstance(eos_token_id, int) and (eos_token_id >= 0) + ), "`eos_token_id` should be a positive integer." + assert length_penalty > 0, "`length_penalty` should be strictely positive." + assert ( + isinstance(num_return_sequences, int) and num_return_sequences > 0 + ), "`num_return_sequences` should be a strictely positive integer." + assert ( + bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) + ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" + + if input_ids is None: + assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( + "you should either supply a context to complete as `input_ids` input " + "or a `bos_token_id` (integer >= 0) as a first token to start the generation." + ) + input_ids = tf.fill((batch_size, 1), bos_token_id) + else: + assert len(shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)." + + # not allow to duplicate outputs when greedy decoding + if do_sample is False: + if num_beams == 1: + # no_beam_search greedy generation conditions + assert ( + num_return_sequences == 1 + ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" + + else: + # beam_search greedy generation conditions + assert ( + num_beams >= num_return_sequences + ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" + + # create attention mask if necessary + # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 + if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): + attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) + elif attention_mask is None: + attention_mask = tf.ones_like(input_ids) + + if pad_token_id is None and eos_token_id is not None: + logger.warning( + "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) + ) + pad_token_id = eos_token_id + + # current position and vocab size + cur_len = shape_list(input_ids)[1] + vocab_size = self.config.vocab_size + + # set effective batch size and effective batch multiplier according to do_sample + if do_sample: + effective_batch_size = batch_size * num_return_sequences + effective_batch_mult = num_return_sequences + else: + effective_batch_size = batch_size + effective_batch_mult = 1 + + if self.config.is_encoder_decoder: + if decoder_start_token_id is None: + decoder_start_token_id = bos_token_id + + assert ( + decoder_start_token_id is not None + ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" + assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) + assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) + + # get encoder and store encoder outputs + encoder = self.get_encoder() + + encoder_outputs = encoder(input_ids, attention_mask=attention_mask) + + # Expand input ids if num_beams > 1 or num_return_sequences > 1 + if num_return_sequences > 1 or num_beams > 1: + input_ids_len = shape_list(input_ids)[-1] + input_ids = tf.broadcast_to( + tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) + ) + attention_mask = tf.broadcast_to( + tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) + ) + input_ids = tf.reshape( + input_ids, (effective_batch_size * num_beams, input_ids_len) + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + attention_mask = tf.reshape( + attention_mask, (effective_batch_size * num_beams, input_ids_len) + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + + if self.config.is_encoder_decoder: + + # create empty decoder_input_ids + input_ids = tf.ones((effective_batch_size * num_beams, 1), dtype=tf.int32,) * decoder_start_token_id + cur_len = 1 + + assert ( + batch_size == encoder_outputs[0].shape[0] + ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " + + # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) + expanded_batch_idxs = tf.reshape( + tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), + shape=(-1,), + ) + # expand encoder_outputs + encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0), *encoder_outputs[1:]) + + else: + encoder_outputs = None + cur_len = shape_list(input_ids)[-1] + + if num_beams > 1: + output = self._generate_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + batch_size=effective_batch_size, + num_return_sequences=num_return_sequences, + length_penalty=length_penalty, + num_beams=num_beams, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + use_cache=use_cache, + ) + else: + output = self._generate_no_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + batch_size=effective_batch_size, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + use_cache=use_cache, + ) + + return output + + def _generate_no_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + bos_token_id, + pad_token_id, + eos_token_id, + decoder_start_token_id, + batch_size, + vocab_size, + encoder_outputs, + attention_mask, + use_cache, + ): + """ Generate sequences for each example without beam search (num_beams == 1). + All returned sequence are generated independantly. + """ + + # length of generated sentences / unfinished sentences + unfinished_sents = tf.ones_like(input_ids[:, 0]) + sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length + + past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache + ) + outputs = self(**model_inputs) + next_token_logits = outputs[0][:, -1, :] + + # if model has past, then set the past variable to speed up decoding + if self._use_cache(outputs, use_cache): + past = outputs[1] + + # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + next_token_logits_penalties = _create_next_token_logits_penalties( + input_ids, next_token_logits, repetition_penalty + ) + next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) + # create banned_tokens boolean mask + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + next_token_logits = set_tensor_by_indices_to_value( + next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + next_token_logits = set_tensor_by_indices_to_value( + next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + # create eos_token_id boolean mask + is_token_logit_eos_token = tf.convert_to_tensor( + [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool + ) + eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [batch_size, vocab_size]) + + next_token_logits = set_tensor_by_indices_to_value( + next_token_logits, eos_token_indices_mask, -float("inf") + ) + + if do_sample: + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + # Top-p/top-k filtering + next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) + # Sample + next_token = tf.squeeze( + tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1 + ) + else: + # Greedy decoding + next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32) + + # update generations and finished sentences + if eos_token_id is not None: + # pad finished sentences if eos_token_id exist + tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) + else: + tokens_to_add = next_token + + # add token and increase length by one + input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1) + cur_len = cur_len + 1 + + if eos_token_id is not None: + eos_in_sents = tokens_to_add == eos_token_id + # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length + is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( + unfinished_sents, tf.cast(eos_in_sents, tf.int32) + ) + sent_lengths = ( + sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos) + + cur_len * is_sents_unfinished_and_token_to_add_is_eos + ) + + # unfinished_sents is set to zero if eos in sentence + unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos + + # stop when there is a in each sentence, or if we exceed the maximul length + if tf.math.reduce_max(unfinished_sents) == 0: + break + + # extend attention_mask for new generated input if only decoder + if self.config.is_encoder_decoder is False: + attention_mask = tf.concat( + [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 + ) + + # if there are different sentences lengths in the batch, some batches have to be padded + min_sent_length = tf.math.reduce_min(sent_lengths) + max_sent_length = tf.math.reduce_max(sent_lengths) + if min_sent_length != max_sent_length: + assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" + # finished sents are filled with pad_token + padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id + + # create length masks for tf.where operation + broad_casted_sent_lengths = tf.broadcast_to( + tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length] + ) + broad_casted_range = tf.transpose( + tf.broadcast_to(tf.expand_dims(tf.range(max_sent_length), -1), [max_sent_length, batch_size]) + ) + + decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding) + else: + decoded = input_ids + + return decoded + + def _generate_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + early_stopping, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + bos_token_id, + pad_token_id, + decoder_start_token_id, + eos_token_id, + batch_size, + num_return_sequences, + length_penalty, + num_beams, + vocab_size, + encoder_outputs, + attention_mask, + use_cache, + ): + """ Generate sequences for each example with beam search. + """ + + # generated hypotheses + generated_hyps = [ + BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) + for _ in range(batch_size) + ] + + # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times + if do_sample is False: + beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32) + beam_scores_end = tf.ones((batch_size, num_beams - 1), dtype=tf.float32) * (-1e9) + beam_scores = tf.concat([beam_scores_begin, beam_scores_end], -1) + else: + beam_scores = tf.zeros((batch_size, num_beams), dtype=tf.float32) + + beam_scores = tf.reshape(beam_scores, (batch_size * num_beams,)) + + # cache compute states + past = encoder_outputs + + # done sentences + done = [False for _ in range(batch_size)] + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache + ) + outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) + next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) + + # if model has past, then set the past variable to speed up decoding + if self._use_cache(outputs, use_cache): + past = outputs[1] + + # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + next_token_logits_penalties = _create_next_token_logits_penalties( + input_ids, next_token_logits, repetition_penalty + ) + next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) + + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + + # calculate log softmax score + scores = tf.nn.log_softmax(next_token_logits, axis=-1) # (batch_size * num_beams, vocab_size) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + # create eos_token_id boolean mask + num_batch_hypotheses = batch_size * num_beams + + is_token_logit_eos_token = tf.convert_to_tensor( + [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool + ) + eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [num_batch_hypotheses, vocab_size]) + + scores = set_tensor_by_indices_to_value(scores, eos_token_indices_mask, -float("inf")) + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + num_batch_hypotheses = batch_size * num_beams + banned_tokens = calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len + ) + # create banned_tokens boolean mask + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + assert shape_list(scores) == [batch_size * num_beams, vocab_size] + + if do_sample: + _scores = scores + tf.broadcast_to( + beam_scores[:, None], (batch_size * num_beams, vocab_size) + ) # (batch_size * num_beams, vocab_size) + + # Top-p/top-k filtering + _scores = tf_top_k_top_p_filtering( + _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 + ) # (batch_size * num_beams, vocab_size) + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) + _scores = tf.reshape(_scores, (batch_size, num_beams * vocab_size)) + + next_tokens = sample_without_replacement( + _scores, num_samples=2 * num_beams + ) # (batch_size, 2 * num_beams) + # Compute next scores + next_scores = tf.gather(_scores, next_tokens, batch_dims=1) # (batch_size, 2 * num_beams) + + # sort the sampled vector to make sure that the first num_beams samples are the best + next_scores_indices = tf.argsort(next_scores, direction="DESCENDING", axis=1) + next_scores = tf.gather(next_scores, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) + next_tokens = tf.gather(next_tokens, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) + else: + # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) + next_scores = scores + tf.broadcast_to( + beam_scores[:, None], (batch_size * num_beams, vocab_size) + ) # (batch_size * num_beams, vocab_size) + + # re-organize to group the beam together (we are keeping top hypothesis accross beams) + next_scores = tf.reshape( + next_scores, (batch_size, num_beams * vocab_size) + ) # (batch_size, num_beams * vocab_size) + + next_scores, next_tokens = tf.math.top_k(next_scores, k=2 * num_beams, sorted=True) + + assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams] + + # next batch beam content + next_batch_beam = [] + + # for each sentence + for batch_idx in range(batch_size): + + # if we are done with this sentence + if done[batch_idx]: + assert ( + len(generated_hyps[batch_idx]) >= num_beams + ), "Batch can only be done if at least {} beams have been generated".format(num_beams) + assert ( + eos_token_id is not None and pad_token_id is not None + ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" + next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch + continue + + # next sentence beam content + next_sent_beam = [] + + # next tokens for this sentence + for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( + zip(next_tokens[batch_idx], next_scores[batch_idx]) + ): + # get beam and token IDs + beam_id = beam_token_id // vocab_size + token_id = beam_token_id % vocab_size + + effective_beam_id = batch_idx * num_beams + beam_id + # add to generated hypotheses if end of sentence or last iteration + if (eos_token_id is not None) and (token_id.numpy() == eos_token_id): + # if beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams + if is_beam_token_worse_than_top_num_beams: + continue + generated_hyps[batch_idx].add( + tf.identity(input_ids[effective_beam_id]), beam_token_score.numpy() + ) + else: + # add next predicted token if it is not eos_token + next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) + + # the beam for next step is full + if len(next_sent_beam) == num_beams: + break + + # Check if we are done so that we can save a pad step if all(done) + done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( + tf.reduce_max(next_scores[batch_idx]).numpy(), cur_len + ) + + # update next beam content + assert len(next_sent_beam) == num_beams, "Beam should always be full" + next_batch_beam.extend(next_sent_beam) + assert len(next_batch_beam) == num_beams * (batch_idx + 1) + + # stop when we are done with each sentence + if all(done): + break + + # sanity check / prepare next batch + assert len(next_batch_beam) == batch_size * num_beams + beam_scores = tf.convert_to_tensor([x[0] for x in next_batch_beam], dtype=tf.float32) + beam_tokens = tf.convert_to_tensor([x[1] for x in next_batch_beam], dtype=tf.int32) + beam_idx = tf.convert_to_tensor([x[2] for x in next_batch_beam], dtype=tf.int32) + + # re-order batch and update current length + input_ids = tf.stack([tf.identity(input_ids[x, :]) for x in beam_idx]) + input_ids = tf.concat([input_ids, tf.expand_dims(beam_tokens, 1)], axis=-1) + cur_len = cur_len + 1 + + # re-order internal states + if past is not None: + past = self._reorder_cache(past, beam_idx) + + # extend attention_mask for new generated input if only decoder + if self.config.is_encoder_decoder is False: + attention_mask = tf.concat( + [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 + ) + + # finalize all open beam hypotheses and end to generated hypotheses + for batch_idx in range(batch_size): + # Add all open beam hypothesis to generated_hyps + if done[batch_idx]: + continue + # test that beam scores match previously calculated scores if not eos and batch_idx not done + if eos_token_id is not None and all( + (token_id % vocab_size).numpy().item() != eos_token_id for token_id in next_tokens[batch_idx] + ): + assert tf.reduce_all( + next_scores[batch_idx, :num_beams] == tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx] + ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( + next_scores[:, :num_beams][batch_idx], tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx] + ) + + # need to add best num_beams hypotheses to generated hyps + for beam_id in range(num_beams): + effective_beam_id = batch_idx * num_beams + beam_id + final_score = beam_scores[effective_beam_id].numpy().item() + final_tokens = input_ids[effective_beam_id] + generated_hyps[batch_idx].add(final_tokens, final_score) + + # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch + output_batch_size = batch_size if do_sample else batch_size * num_return_sequences + output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences + + # select the best hypotheses + sent_lengths_list = [] + best = [] + + # retrieve best hypotheses + for i, hypotheses in enumerate(generated_hyps): + sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) + for j in range(output_num_return_sequences_per_batch): + best_hyp = sorted_hyps.pop()[1] + sent_lengths_list.append(len(best_hyp)) + best.append(best_hyp) + assert output_batch_size == len(best), "Output batch size {} must match output beam hypotheses {}".format( + output_batch_size, len(best) + ) + + sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32) + + # shorter batches are filled with pad_token + if tf.reduce_min(sent_lengths).numpy() != tf.reduce_max(sent_lengths).numpy(): + assert pad_token_id is not None, "`Pad_token_id` has to be defined" + sent_max_len = min(tf.reduce_max(sent_lengths).numpy() + 1, max_length) + decoded_list = [] + + # fill with hypothesis and eos_token_id if necessary + for i, hypo in enumerate(best): + assert sent_lengths[i] == shape_list(hypo)[0] + # if sent_length is max_len do not pad + if sent_lengths[i] == sent_max_len: + decoded_slice = hypo + else: + # else pad to sent_max_len + num_pad_tokens = sent_max_len - sent_lengths[i] + padding = pad_token_id * tf.ones((num_pad_tokens,), dtype=tf.int32) + decoded_slice = tf.concat([hypo, padding], axis=-1) + + # finish sentence with EOS token + if sent_lengths[i] < max_length: + decoded_slice = tf.where( + tf.range(sent_max_len, dtype=tf.int32) == sent_lengths[i], + eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32), + decoded_slice, + ) + # add to list + decoded_list.append(decoded_slice) + + decoded = tf.stack(decoded_list) + else: + # none of the hypotheses have an eos_token + assert (len(hypo) == max_length for hypo in best) + decoded = tf.stack(best) + + return decoded + + @staticmethod + def _reorder_cache(past, beam_idx): + return tuple(tf.gather(layer_past, beam_idx, axis=1) for layer_past in past) + + +def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty): + # create logit penalties for already seen input_ids + token_penalties = np.ones(shape_list(logits)) + prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()] + for i, prev_input_id in enumerate(prev_input_ids): + logit_penalized = logits[i].numpy()[prev_input_id] + logit_penalties = np.zeros(logit_penalized.shape) + # if previous logit score is < 0 then multiply repetition penalty else divide + logit_penalties[logit_penalized < 0] = repetition_penalty + logit_penalties[logit_penalized > 0] = 1 / repetition_penalty + np.put(token_penalties[i], prev_input_id, logit_penalties) + return tf.convert_to_tensor(token_penalties, dtype=tf.float32) + + +def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): + # Copied from fairseq for no_repeat_ngram in beam_search""" + if cur_len + 1 < no_repeat_ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].numpy().tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] + + def _get_generated_ngrams(hypo_idx): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - no_repeat_ngram_size + ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) + return generated_ngrams[hypo_idx].get(ngram_idx, []) + + banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] + return banned_tokens + + +def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): + banned_tokens = [] + + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_input_ids): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + + if prev_tokens[-len(tokens) :] == tokens: + # if tokens match + return True + else: + return False + + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + + for banned_token_seq in bad_words_ids: + assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( + bad_words_ids + ) + + if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: + # if tokens do not match continue + continue + + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + +def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): + """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (batch size, vocabulary size) + if top_k > 0: keep only top k tokens with highest probability (top-k filtering). + if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) + Make sure we keep at least min_tokens_to_keep per batch example in the output + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + logits_shape = shape_list(logits) + + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1]) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None] + logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) + + if top_p < 1.0: + sorted_indices = tf.argsort(logits, direction="DESCENDING") + sorted_logits = tf.gather( + logits, sorted_indices, axis=-1, batch_dims=1 + ) # expects logits to be of dim (batch_size, vocab_size) + + cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove = tf.concat( + [ + tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]), + sorted_indices_to_remove[:, min_tokens_to_keep:], + ], + -1, + ) + + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1) + sorted_indices_to_remove = tf.concat( + [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1, + ) + # scatter sorted tensors to original indexing + indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices) + logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) + return logits + + +def scatter_values_on_batch_indices(values, batch_indices): + shape = shape_list(batch_indices) + # broadcast batch dim to shape + broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1]) + # transform batch_indices to pair_indices + pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0)) + # scatter values to pair indices + return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape) + + +def set_tensor_by_indices_to_value(tensor, indices, value): + # create value_tensor since tensor value assignment is not possible in TF + value_tensor = tf.zeros_like(tensor) + value + return tf.where(indices, value_tensor, tensor) + + +def sample_without_replacement(logits, num_samples): + """ + categorical sampling witouth replacement is currently not implemented + the gumbel-max trick will do for now + see https://github.com/tensorflow/tensorflow/issues/9260 for more info + """ + z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1)) + _, indices = tf.nn.top_k(logits + z, num_samples) + return indices + + +def shape_list(x): + """Deal with dynamic shape in tensorflow cleanly.""" + static = x.shape.as_list() + dynamic = tf.shape(x) + return [dynamic[i] if s is None else s for i, s in enumerate(static)] + + +class BeamHypotheses(object): + def __init__(self, num_beams, max_length, length_penalty, early_stopping): + """ + Initialize n-best list of hypotheses. + """ + self.max_length = max_length - 1 # ignoring bos_token + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp, sum_logprobs): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / len(hyp) ** self.length_penalty + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp)) + if len(self) > self.num_beams: + sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) + del self.beams[sorted_scores[0][1]] + self.worst_score = sorted_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs, cur_len): + """ + If there are enough hypotheses and that none of the hypotheses being generated + can become better than the worst one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / cur_len ** self.length_penalty + ret = self.worst_score >= cur_score + return ret diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py new file mode 100644 index 0000000000..ec0bf803ee --- /dev/null +++ b/src/transformers/generation_utils.py @@ -0,0 +1,989 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Iterable, Optional, Tuple + +import torch +from torch import Tensor +from torch.nn import functional as F + + +logger = logging.getLogger(__name__) + + +class GenerationMixin: + """ + A class contraining all of the functions supporting generation, to be used as a mixin in PreTrainedModel. + """ + + def prepare_inputs_for_generation(self, input_ids, **kwargs): + return {"input_ids": input_ids} + + def adjust_logits_during_generation(self, logits, **kwargs): + return logits + + def _use_cache(self, outputs, use_cache): + """During generation, decide whether to pass the `past` variable to the next forward pass.""" + if len(outputs) <= 1 or use_cache is False: + return False + if hasattr(self.config, "mem_len") and self.config.mem_len == 0: + return False + return True + + def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty): + """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """ + for i in range(batch_size * num_beams): + for previous_token in set(prev_output_tokens[i].tolist()): + # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability + if lprobs[i, previous_token] < 0: + lprobs[i, previous_token] *= repetition_penalty + else: + lprobs[i, previous_token] /= repetition_penalty + + def postprocess_next_token_scores( + self, + scores, + input_ids, + no_repeat_ngram_size, + bad_words_ids, + cur_len, + min_length, + max_length, + eos_token_id, + repetition_penalty, + batch_size, + num_beams, + ): + # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + self.enforce_repetition_penalty_( + scores, batch_size, num_beams, input_ids, repetition_penalty, + ) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + scores[:, eos_token_id] = -float("inf") + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + num_batch_hypotheses = batch_size * num_beams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + banned_batch_tokens = calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len + ) + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float("inf") + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + for i, banned_tokens in enumerate(banned_tokens): + scores[i, banned_tokens] = -float("inf") + + return scores + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.LongTensor] = None, + max_length: Optional[int] = None, + min_length: Optional[int] = None, + do_sample: Optional[bool] = None, + early_stopping: Optional[bool] = None, + num_beams: Optional[int] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + repetition_penalty: Optional[float] = None, + bad_words_ids: Optional[Iterable[int]] = None, + bos_token_id: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + length_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + num_return_sequences: Optional[int] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_start_token_id: Optional[int] = None, + use_cache: Optional[bool] = None, + **model_specific_kwargs + ) -> torch.LongTensor: + r""" Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. + + Adapted in part from `Facebook's XLM beam search code`_. + + .. _`Facebook's XLM beam search code`: + https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 + + + Parameters: + + input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)` + The sequence used as a prompt for the generation. If `None` the method initializes + it as an empty `torch.LongTensor` of shape `(1,)`. + + max_length: (`optional`) int + The max length of the sequence to be generated. Between `min_length` and infinity. Default to 20. + + min_length: (`optional`) int + The min length of the sequence to be generated. Between 0 and infinity. Default to 0. + + do_sample: (`optional`) bool + If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + early_stopping: (`optional`) bool + if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + num_beams: (`optional`) int + Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. + + temperature: (`optional`) float + The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. + + top_k: (`optional`) int + The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. + + top_p: (`optional`) float + The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. + + repetition_penalty: (`optional`) float + The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. + + pad_token_id: (`optional`) int + Padding token. Default to specicic model pad_token_id or None if it does not exist. + + bos_token_id: (`optional`) int + BOS token. Defaults to `bos_token_id` as defined in the models config. + + eos_token_id: (`optional`) int + EOS token. Defaults to `eos_token_id` as defined in the models config. + + length_penalty: (`optional`) float + Exponential penalty to the length. Default to 1. + + no_repeat_ngram_size: (`optional`) int + If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. + bad_words_ids: (`optional`) list of lists of int + `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. + + num_return_sequences: (`optional`) int + The number of independently computed returned sequences for each element in the batch. Default to 1. + + attention_mask (`optional`) obj: `torch.LongTensor` of same shape as `input_ids` + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + Defaults to `None`. + + `What are attention masks? <../glossary.html#attention-mask>`__ + + decoder_start_token_id=None: (`optional`) int + If an encoder-decoder model starts decoding with a different token than BOS. + Defaults to `None` and is changed to `BOS` later. + + use_cache: (`optional`) bool + If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`. + + model_specific_kwargs: (`optional`) dict + Additional model specific kwargs will be forwarded to the `forward` function of the model. + + Return: + + output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)` + sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` + + Examples:: + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + outputs = model.generate(max_length=40) # do greedy decoding + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. + input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. + input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl + bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated + """ + + # We cannot generate if the model does not have a LM head + if self.get_output_embeddings() is None: + raise AttributeError( + "You tried to generate sequences with a model that does not have a LM Head." + "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )" + ) + + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + do_sample = do_sample if do_sample is not None else self.config.do_sample + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + use_cache = use_cache if use_cache is not None else self.config.use_cache + num_beams = num_beams if num_beams is not None else self.config.num_beams + temperature = temperature if temperature is not None else self.config.temperature + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + + if input_ids is not None: + batch_size = input_ids.shape[0] # overriden by the input batch_size + else: + batch_size = 1 + + assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." + assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." + assert isinstance(do_sample, bool), "`do_sample` should be a boolean." + assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." + assert isinstance(use_cache, bool), "`use_cache` should be a boolean." + assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." + assert temperature > 0, "`temperature` should be strictly positive." + assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." + assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." + assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." + assert input_ids is not None or ( + isinstance(bos_token_id, int) and bos_token_id >= 0 + ), "If input_ids is not defined, `bos_token_id` should be a positive integer." + assert pad_token_id is None or ( + isinstance(pad_token_id, int) and (pad_token_id >= 0) + ), "`pad_token_id` should be a positive integer." + assert (eos_token_id is None) or ( + isinstance(eos_token_id, int) and (eos_token_id >= 0) + ), "`eos_token_id` should be a positive integer." + assert length_penalty > 0, "`length_penalty` should be strictly positive." + assert ( + isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 + ), "`no_repeat_ngram_size` should be a positive integer." + assert ( + isinstance(num_return_sequences, int) and num_return_sequences > 0 + ), "`num_return_sequences` should be a strictly positive integer." + assert ( + bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) + ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" + + if input_ids is None: + assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( + "you should either supply a context to complete as `input_ids` input " + "or a `bos_token_id` (integer >= 0) as a first token to start the generation." + ) + input_ids = torch.full( + (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device, + ) + else: + assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." + + # not allow to duplicate outputs when greedy decoding + if do_sample is False: + if num_beams == 1: + # no_beam_search greedy generation conditions + assert ( + num_return_sequences == 1 + ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" + + else: + # beam_search greedy generation conditions + assert ( + num_beams >= num_return_sequences + ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" + + # create attention mask if necessary + # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 + if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids): + attention_mask = input_ids.ne(pad_token_id).long() + elif attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + # set pad_token_id to eos_token_id if not set. Important that this is done after + # attention_mask is created + if pad_token_id is None and eos_token_id is not None: + logger.warning( + "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) + ) + pad_token_id = eos_token_id + + # current position and vocab size + if hasattr(self.config, "vocab_size"): + vocab_size = self.config.vocab_size + elif ( + self.config.is_encoder_decoder + and hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "vocab_size") + ): + vocab_size = self.config.decoder.vocab_size + + # set effective batch size and effective batch multiplier according to do_sample + if do_sample: + effective_batch_size = batch_size * num_return_sequences + effective_batch_mult = num_return_sequences + else: + effective_batch_size = batch_size + effective_batch_mult = 1 + + if self.config.is_encoder_decoder: + if decoder_start_token_id is None: + decoder_start_token_id = bos_token_id + + assert ( + decoder_start_token_id is not None + ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" + assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) + assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) + + # get encoder and store encoder outputs + encoder = self.get_encoder() + + encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask) + + # Expand input ids if num_beams > 1 or num_return_sequences > 1 + if num_return_sequences > 1 or num_beams > 1: + input_ids_len = input_ids.shape[-1] + input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len) + attention_mask = attention_mask.unsqueeze(1).expand( + batch_size, effective_batch_mult * num_beams, input_ids_len + ) + + input_ids = input_ids.contiguous().view( + effective_batch_size * num_beams, input_ids_len + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + attention_mask = attention_mask.contiguous().view( + effective_batch_size * num_beams, input_ids_len + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + + if self.config.is_encoder_decoder: + # create empty decoder_input_ids + input_ids = torch.full( + (effective_batch_size * num_beams, 1), + decoder_start_token_id, + dtype=torch.long, + device=next(self.parameters()).device, + ) + cur_len = 1 + + assert ( + batch_size == encoder_outputs[0].shape[0] + ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " + + # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) + expanded_batch_idxs = ( + torch.arange(batch_size) + .view(-1, 1) + .repeat(1, num_beams * effective_batch_mult) + .view(-1) + .to(input_ids.device) + ) + # expand encoder_outputs + encoder_outputs = (encoder_outputs[0].index_select(0, expanded_batch_idxs), *encoder_outputs[1:]) + + else: + encoder_outputs = None + cur_len = input_ids.shape[-1] + + if num_beams > 1: + output = self._generate_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + num_return_sequences=num_return_sequences, + length_penalty=length_penalty, + num_beams=num_beams, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + use_cache=use_cache, + model_specific_kwargs=model_specific_kwargs, + ) + else: + output = self._generate_no_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + use_cache=use_cache, + model_specific_kwargs=model_specific_kwargs, + ) + + return output + + def _generate_no_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + pad_token_id, + eos_token_id, + batch_size, + encoder_outputs, + attention_mask, + use_cache, + model_specific_kwargs, + ): + """ Generate sequences for each example without beam search (num_beams == 1). + All returned sequence are generated independantly. + """ + # length of generated sentences / unfinished sentences + unfinished_sents = input_ids.new(batch_size).fill_(1) + sent_lengths = input_ids.new(batch_size).fill_(max_length) + + past = (encoder_outputs, None) if encoder_outputs is not None else None + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs + ) + + outputs = self(**model_inputs) + next_token_logits = outputs[0][:, -1, :] + + scores = self.postprocess_next_token_scores( + scores=next_token_logits, + input_ids=input_ids, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + cur_len=cur_len, + min_length=min_length, + max_length=max_length, + eos_token_id=eos_token_id, + repetition_penalty=repetition_penalty, + batch_size=batch_size, + num_beams=1, + ) + + # if model has past, then set the past variable to speed up decoding + if self._use_cache(outputs, use_cache): + past = outputs[1] + + if do_sample: + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + scores = scores / temperature + # Top-p/top-k filtering + next_token_logscores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p) + # Sample + probs = F.softmax(next_token_logscores, dim=-1) + next_token = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + # Greedy decoding + next_token = torch.argmax(next_token_logits, dim=-1) + + # update generations and finished sentences + if eos_token_id is not None: + # pad finished sentences if eos_token_id exist + tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) + else: + tokens_to_add = next_token + + # add token and increase length by one + input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) + cur_len = cur_len + 1 + + if eos_token_id is not None: + eos_in_sents = tokens_to_add == eos_token_id + # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length + is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool() + sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len) + # unfinished_sents is set to zero if eos in sentence + unfinished_sents.mul_((~eos_in_sents).long()) + + # stop when there is a in each sentence, or if we exceed the maximul length + if unfinished_sents.max() == 0: + break + + # extend attention_mask for new generated input if only decoder + if self.config.is_encoder_decoder is False: + attention_mask = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + return input_ids + + def _generate_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + early_stopping, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + pad_token_id, + eos_token_id, + batch_size, + num_return_sequences, + length_penalty, + num_beams, + vocab_size, + encoder_outputs, + attention_mask, + use_cache, + model_specific_kwargs, + ): + """ Generate sequences for each example with beam search. + """ + + # generated hypotheses + generated_hyps = [ + BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) + for _ in range(batch_size) + ] + + # scores for each sentence in the beam + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + + # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times + if do_sample is False: + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) + + # cache compute states + past = (encoder_outputs, None) if encoder_outputs is not None else None + + # done sentences + done = [False for _ in range(batch_size)] + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation( + input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs + ) + outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) + next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) + + # if model has past, then set the past variable to speed up decoding + if self._use_cache(outputs, use_cache): + past = outputs[1] + if self.config.is_encoder_decoder and do_sample is False: + # TODO (PVP) still a bit hacky here - there might be a better solution + next_token_logits = self.adjust_logits_during_generation( + next_token_logits, cur_len=cur_len, max_length=max_length + ) + + scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) + + scores = self.postprocess_next_token_scores( + scores=scores, + input_ids=input_ids, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + cur_len=cur_len, + min_length=min_length, + max_length=max_length, + eos_token_id=eos_token_id, + repetition_penalty=repetition_penalty, + batch_size=batch_size, + num_beams=num_beams, + ) + + assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format( + scores.shape, (batch_size * num_beams, vocab_size) + ) + + if do_sample: + _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) + # Temperature + if temperature != 1.0: + _scores = _scores / temperature + # Top-p/top-k filtering + _scores = top_k_top_p_filtering( + _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 + ) # (batch_size * num_beams, vocab_size) + # re-organize to group the beam together to sample from all beam_idxs + _scores = _scores.contiguous().view( + batch_size, num_beams * vocab_size + ) # (batch_size, num_beams * vocab_size) + + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) + probs = F.softmax(_scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2) + # Compute next scores + next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2) + # sort the sampled vector to make sure that the first num_beams samples are the best + next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1) + next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2) + + else: + next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) + + # re-organize to group the beam together (we are keeping top hypothesis accross beams) + next_scores = next_scores.view( + batch_size, num_beams * vocab_size + ) # (batch_size, num_beams * vocab_size) + + next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True) + + assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) + + # next batch beam content + next_batch_beam = [] + + # for each sentence + for batch_idx in range(batch_size): + + # if we are done with this sentence, add a pad token + if done[batch_idx]: + assert ( + len(generated_hyps[batch_idx]) >= num_beams + ), "Batch can only be done if at least {} beams have been generated".format(num_beams) + assert ( + eos_token_id is not None and pad_token_id is not None + ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" + next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch + continue + + # next sentence beam content, this will get added to next_batch_beam + next_sent_beam = [] + + # next tokens for this sentence + for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( + zip(next_tokens[batch_idx], next_scores[batch_idx]) + ): + # get beam and token IDs + beam_id = beam_token_id // vocab_size + token_id = beam_token_id % vocab_size + + effective_beam_id = batch_idx * num_beams + beam_id + # add to generated hypotheses if end of sentence + if (eos_token_id is not None) and (token_id.item() == eos_token_id): + # if beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams + if is_beam_token_worse_than_top_num_beams: + continue + generated_hyps[batch_idx].add( + input_ids[effective_beam_id].clone(), beam_token_score.item(), + ) + else: + # add next predicted token since it is not eos_token + next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) + + # once the beam for next step is full, don't add more tokens to it. + if len(next_sent_beam) == num_beams: + break + + # Check if we are done so that we can save a pad step if all(done) + done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( + next_scores[batch_idx].max().item(), cur_len + ) + + # update next beam content + assert len(next_sent_beam) == num_beams, "Beam should always be full" + next_batch_beam.extend(next_sent_beam) + assert len(next_batch_beam) == num_beams * (batch_idx + 1), "We should have added num_beams each step" + + # stop when we are done with each sentence + if all(done): + break + + # sanity check / prepare next batch + assert len(next_batch_beam) == batch_size * num_beams + beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) + beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) + beam_idx = input_ids.new([x[2] for x in next_batch_beam]) + + # re-order batch and update current length + input_ids = input_ids[beam_idx, :] + input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) + cur_len = cur_len + 1 + + # re-order internal states + if past is not None: + past = self._reorder_cache(past, beam_idx) + + # extend attention_mask for new generated input if only decoder + if self.config.is_encoder_decoder is False: + attention_mask = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + # finalize all open beam hypotheses and add to generated hypotheses + for batch_idx in range(batch_size): + if done[batch_idx]: + continue + + # test that beam scores match previously calculated scores if not eos and batch_idx not done + if eos_token_id is not None and all( + (token_id % vocab_size).item() != eos_token_id for token_id in next_tokens[batch_idx] + ): + assert torch.all( + next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx] + ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( + next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx], + ) + + # need to add best num_beams hypotheses to generated hyps + for beam_id in range(num_beams): + effective_beam_id = batch_idx * num_beams + beam_id + final_score = beam_scores[effective_beam_id].item() + final_tokens = input_ids[effective_beam_id] + generated_hyps[batch_idx].add(final_tokens, final_score) + + # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch + output_batch_size = batch_size if do_sample else batch_size * num_return_sequences + output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences + + # select the best hypotheses + sent_lengths = input_ids.new(output_batch_size) + best = [] + + # retrieve best hypotheses + for i, hypotheses in enumerate(generated_hyps): + sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) + for j in range(output_num_return_sequences_per_batch): + effective_batch_idx = output_num_return_sequences_per_batch * i + j + best_hyp = sorted_hyps.pop()[1] + sent_lengths[effective_batch_idx] = len(best_hyp) + best.append(best_hyp) + + # shorter batches are padded + if sent_lengths.min().item() != sent_lengths.max().item(): + assert pad_token_id is not None, "`Pad_token_id` has to be defined" + sent_max_len = min(sent_lengths.max().item() + 1, max_length) + decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id) + + # fill with hypothesis and eos_token_id if necessary + for i, hypo in enumerate(best): + decoded[i, : sent_lengths[i]] = hypo + if sent_lengths[i] < max_length: + decoded[i, sent_lengths[i]] = eos_token_id + else: + # none of the hypotheses have an eos_token + assert (len(hypo) == max_length for hypo in best) + decoded = torch.stack(best).type(torch.long).to(next(self.parameters()).device) + + return decoded + + @staticmethod + def _reorder_cache(past: Tuple, beam_idx: Tensor) -> Tuple[Tensor]: + return tuple(layer_past.index_select(1, beam_idx) for layer_past in past) + + +def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int) -> None: + """Copied from fairseq for no_repeat_ngram in beam_search""" + if cur_len + 1 < no_repeat_ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] + + def _get_generated_ngrams(hypo_idx): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - no_repeat_ngram_size + ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist()) + return generated_ngrams[hypo_idx].get(ngram_idx, []) + + banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] + return banned_tokens + + +def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iterable[int]) -> Iterable[int]: + banned_tokens = [] + + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_input_ids): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + + if prev_tokens[-len(tokens) :] == tokens: + # if tokens match + return True + else: + return False + + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + + for banned_token_seq in bad_words_ids: + assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( + bad_words_ids + ) + + if _tokens_match(prev_input_ids_slice.tolist(), banned_token_seq[:-1]) is False: + # if tokens do not match continue + continue + + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + +def top_k_top_p_filtering( + logits: Tensor, + top_k: int = 0, + top_p: float = 1.0, + filter_value: float = -float("Inf"), + min_tokens_to_keep: int = 1, +) -> Tensor: + """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (batch size, vocabulary size) + if top_k > 0: keep only top k tokens with highest probability (top-k filtering). + if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) + Make sure we keep at least min_tokens_to_keep per batch example in the output + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p < 1.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + logits[indices_to_remove] = filter_value + return logits + + +class BeamHypotheses(object): + def __init__(self, num_beams, max_length, length_penalty, early_stopping): + """ + Initialize n-best list of hypotheses. + """ + self.max_length = max_length - 1 # ignoring bos_token + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp, sum_logprobs): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / len(hyp) ** self.length_penalty + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp)) + if len(self) > self.num_beams: + sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) + del self.beams[sorted_scores[0][1]] + self.worst_score = sorted_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs, cur_len): + """ + If there are enough hypotheses and that none of the hypotheses being generated + can become better than the worst one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / cur_len ** self.length_penalty + ret = self.worst_score >= cur_score + return ret diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 648e434f48..741d013b62 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -25,6 +25,7 @@ from tensorflow.python.keras.saving import hdf5_format from .configuration_utils import PretrainedConfig from .file_utils import DUMMY_INPUTS, TF2_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url +from .generation_tf_utils import TFGenerationMixin from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model @@ -144,7 +145,7 @@ class TFSequenceClassificationLoss: TFMultipleChoiceLoss = TFSequenceClassificationLoss -class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin): +class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): r""" Base class for all TF models. :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models @@ -538,1053 +539,6 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin): return model - def prepare_inputs_for_generation(self, inputs, **kwargs): - return {"inputs": inputs} - - def _use_cache(self, outputs, use_cache): - """During generation, decide whether to pass the `past` variable to the next forward pass.""" - if len(outputs) <= 1 or use_cache is False: - return False - if hasattr(self.config, "mem_len") and self.config.mem_len == 0: - return False - return True - - def generate( - self, - input_ids=None, - max_length=None, - min_length=None, - do_sample=None, - early_stopping=None, - num_beams=None, - temperature=None, - top_k=None, - top_p=None, - repetition_penalty=None, - bad_words_ids=None, - bos_token_id=None, - pad_token_id=None, - eos_token_id=None, - length_penalty=None, - no_repeat_ngram_size=None, - num_return_sequences=None, - attention_mask=None, - decoder_start_token_id=None, - use_cache=None, - ): - r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling - and beam-search. - - Adapted in part from `Facebook's XLM beam search code`_. - - .. _`Facebook's XLM beam search code`: - https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 - - - Parameters: - - input_ids: (`optional`) `tf.Tensor` of `dtype=tf.int32` of shape `(batch_size, sequence_length)` - The sequence used as a prompt for the generation. If `None` the method initializes - it as an empty `tf.Tensor` of shape `(1,)`. - - max_length: (`optional`) int - The max length of the sequence to be generated. Between 1 and infinity. Default to 20. - - min_length: (`optional`) int - The min length of the sequence to be generated. Between 0 and infinity. Default to 0. - do_sample: (`optional`) bool - If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. - - early_stopping: (`optional`) bool - if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. - - num_beams: (`optional`) int - Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. - - temperature: (`optional`) float - The value used to module the next token probabilities. Must be strictely positive. Default to 1.0. - - top_k: (`optional`) int - The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. - - top_p: (`optional`) float - The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. - - repetition_penalty: (`optional`) float - The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. - - bos_token_id: (`optional`) int - Beginning of sentence token if no prompt is provided. Default to specicic model bos_token_id or None if it does not exist. - - pad_token_id: (`optional`) int - Pad token. Defaults to pad_token_id as defined in the models config. - - eos_token_id: (`optional`) int - EOS token. Defaults to eos_token_id as defined in the models config. - - length_penalty: (`optional`) float - Exponential penalty to the length. Default to 1. - - no_repeat_ngram_size: (`optional`) int - If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. - - bad_words_ids: (`optional`) list of lists of int - `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. - - num_return_sequences: (`optional`) int - The number of independently computed returned sequences for each element in the batch. Default to 1. - - attention_mask (`optional`) obj: `tf.Tensor` with `dtype=tf.int32` of same shape as `input_ids` - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - Defaults to `None`. - - `What are attention masks? <../glossary.html#attention-mask>`__ - - decoder_start_token_id=None: (`optional`) int - Start token id for the decoder. Defaults to ``decoder_start_token_id`` as defined the model's config or to the ``bos_token_id`` - if no ``decoder_start_token_id`` is found in the config. - This is only relevant for encoder-decoder models. - - use_cache: (`optional`) bool - If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`. - - Return: - - output: `tf.Tensor` of `dtype=tf.int32` shape `(batch_size * num_return_sequences, sequence_length)` - sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` - - Examples:: - - tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. - outputs = model.generate(max_length=40) # do greedy decoding - print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) - - tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. - input_context = 'The dog' - input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context - outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' - for i in range(3): # 3 output sequences were generated - print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) - - tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. - input_context = 'The dog' - input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling - for i in range(3): # 3 output sequences were generated - print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) - - tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. - input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl - input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences - print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) - - tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer - model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. - input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl - bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] - input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated - """ - - # We cannot generate if the model does not have a LM head - if self.get_output_embeddings() is None: - raise AttributeError( - "You tried to generate sequences with a model that does not have a LM Head." - "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" - ) - - max_length = max_length if max_length is not None else self.config.max_length - min_length = min_length if min_length is not None else self.config.min_length - do_sample = do_sample if do_sample is not None else self.config.do_sample - early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping - use_cache = use_cache if use_cache is not None else self.config.use_cache - num_beams = num_beams if num_beams is not None else self.config.num_beams - temperature = temperature if temperature is not None else self.config.temperature - top_k = top_k if top_k is not None else self.config.top_k - top_p = top_p if top_p is not None else self.config.top_p - repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty - bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id - pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id - length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty - no_repeat_ngram_size = ( - no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size - ) - bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids - num_return_sequences = ( - num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences - ) - decoder_start_token_id = ( - decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id - ) - - if input_ids is not None: - batch_size = shape_list(input_ids)[0] # overriden by the input batch_size - else: - batch_size = 1 - - assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer." - assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." - assert isinstance(do_sample, bool), "`do_sample` should be a boolean." - assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." - assert isinstance(use_cache, bool), "`use_cache` should be a boolean." - assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer." - assert temperature > 0, "`temperature` should be strictely positive." - assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." - assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." - assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." - assert input_ids is not None or ( - isinstance(bos_token_id, int) and bos_token_id >= 0 - ), "If input_ids is not defined, `bos_token_id` should be a positive integer." - assert pad_token_id is None or ( - isinstance(pad_token_id, int) and (pad_token_id >= 0) - ), "`pad_token_id` should be a positive integer." - assert (eos_token_id is None) or ( - isinstance(eos_token_id, int) and (eos_token_id >= 0) - ), "`eos_token_id` should be a positive integer." - assert length_penalty > 0, "`length_penalty` should be strictely positive." - assert ( - isinstance(num_return_sequences, int) and num_return_sequences > 0 - ), "`num_return_sequences` should be a strictely positive integer." - assert ( - bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) - ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" - - if input_ids is None: - assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( - "you should either supply a context to complete as `input_ids` input " - "or a `bos_token_id` (integer >= 0) as a first token to start the generation." - ) - input_ids = tf.fill((batch_size, 1), bos_token_id) - else: - assert len(shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)." - - # not allow to duplicate outputs when greedy decoding - if do_sample is False: - if num_beams == 1: - # no_beam_search greedy generation conditions - assert ( - num_return_sequences == 1 - ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" - - else: - # beam_search greedy generation conditions - assert ( - num_beams >= num_return_sequences - ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" - - # create attention mask if necessary - # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 - if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): - attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) - elif attention_mask is None: - attention_mask = tf.ones_like(input_ids) - - if pad_token_id is None and eos_token_id is not None: - logger.warning( - "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) - ) - pad_token_id = eos_token_id - - # current position and vocab size - cur_len = shape_list(input_ids)[1] - vocab_size = self.config.vocab_size - - # set effective batch size and effective batch multiplier according to do_sample - if do_sample: - effective_batch_size = batch_size * num_return_sequences - effective_batch_mult = num_return_sequences - else: - effective_batch_size = batch_size - effective_batch_mult = 1 - - if self.config.is_encoder_decoder: - if decoder_start_token_id is None: - decoder_start_token_id = bos_token_id - - assert ( - decoder_start_token_id is not None - ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" - assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) - assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) - - # get encoder and store encoder outputs - encoder = self.get_encoder() - - encoder_outputs = encoder(input_ids, attention_mask=attention_mask) - - # Expand input ids if num_beams > 1 or num_return_sequences > 1 - if num_return_sequences > 1 or num_beams > 1: - input_ids_len = shape_list(input_ids)[-1] - input_ids = tf.broadcast_to( - tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) - ) - attention_mask = tf.broadcast_to( - tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) - ) - input_ids = tf.reshape( - input_ids, (effective_batch_size * num_beams, input_ids_len) - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - attention_mask = tf.reshape( - attention_mask, (effective_batch_size * num_beams, input_ids_len) - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - - if self.config.is_encoder_decoder: - - # create empty decoder_input_ids - input_ids = tf.ones((effective_batch_size * num_beams, 1), dtype=tf.int32,) * decoder_start_token_id - cur_len = 1 - - assert ( - batch_size == encoder_outputs[0].shape[0] - ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " - - # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) - expanded_batch_idxs = tf.reshape( - tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), - shape=(-1,), - ) - # expand encoder_outputs - encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0), *encoder_outputs[1:]) - - else: - encoder_outputs = None - cur_len = shape_list(input_ids)[-1] - - if num_beams > 1: - output = self._generate_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - early_stopping=early_stopping, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - bos_token_id=bos_token_id, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - decoder_start_token_id=decoder_start_token_id, - batch_size=effective_batch_size, - num_return_sequences=num_return_sequences, - length_penalty=length_penalty, - num_beams=num_beams, - vocab_size=vocab_size, - encoder_outputs=encoder_outputs, - attention_mask=attention_mask, - use_cache=use_cache, - ) - else: - output = self._generate_no_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - bos_token_id=bos_token_id, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - decoder_start_token_id=decoder_start_token_id, - batch_size=effective_batch_size, - vocab_size=vocab_size, - encoder_outputs=encoder_outputs, - attention_mask=attention_mask, - use_cache=use_cache, - ) - - return output - - def _generate_no_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - bos_token_id, - pad_token_id, - eos_token_id, - decoder_start_token_id, - batch_size, - vocab_size, - encoder_outputs, - attention_mask, - use_cache, - ): - """ Generate sequences for each example without beam search (num_beams == 1). - All returned sequence are generated independantly. - """ - - # length of generated sentences / unfinished sentences - unfinished_sents = tf.ones_like(input_ids[:, 0]) - sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length - - past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache - ) - outputs = self(**model_inputs) - next_token_logits = outputs[0][:, -1, :] - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - next_token_logits_penalties = _create_next_token_logits_penalties( - input_ids, next_token_logits, repetition_penalty - ) - next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) - # create banned_tokens boolean mask - banned_tokens_indices_mask = [] - for banned_tokens_slice in banned_tokens: - banned_tokens_indices_mask.append( - [True if token in banned_tokens_slice else False for token in range(vocab_size)] - ) - - next_token_logits = set_tensor_by_indices_to_value( - next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") - ) - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - banned_tokens_indices_mask = [] - for banned_tokens_slice in banned_tokens: - banned_tokens_indices_mask.append( - [True if token in banned_tokens_slice else False for token in range(vocab_size)] - ) - - next_token_logits = set_tensor_by_indices_to_value( - next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") - ) - - # set eos token prob to zero if min_length is not reached - if eos_token_id is not None and cur_len < min_length: - # create eos_token_id boolean mask - is_token_logit_eos_token = tf.convert_to_tensor( - [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool - ) - eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [batch_size, vocab_size]) - - next_token_logits = set_tensor_by_indices_to_value( - next_token_logits, eos_token_indices_mask, -float("inf") - ) - - if do_sample: - # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - # Top-p/top-k filtering - next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) - # Sample - next_token = tf.squeeze( - tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1 - ) - else: - # Greedy decoding - next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32) - - # update generations and finished sentences - if eos_token_id is not None: - # pad finished sentences if eos_token_id exist - tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) - else: - tokens_to_add = next_token - - # add token and increase length by one - input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1) - cur_len = cur_len + 1 - - if eos_token_id is not None: - eos_in_sents = tokens_to_add == eos_token_id - # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length - is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( - unfinished_sents, tf.cast(eos_in_sents, tf.int32) - ) - sent_lengths = ( - sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos) - + cur_len * is_sents_unfinished_and_token_to_add_is_eos - ) - - # unfinished_sents is set to zero if eos in sentence - unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos - - # stop when there is a in each sentence, or if we exceed the maximul length - if tf.math.reduce_max(unfinished_sents) == 0: - break - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask = tf.concat( - [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 - ) - - # if there are different sentences lengths in the batch, some batches have to be padded - min_sent_length = tf.math.reduce_min(sent_lengths) - max_sent_length = tf.math.reduce_max(sent_lengths) - if min_sent_length != max_sent_length: - assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" - # finished sents are filled with pad_token - padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id - - # create length masks for tf.where operation - broad_casted_sent_lengths = tf.broadcast_to( - tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length] - ) - broad_casted_range = tf.transpose( - tf.broadcast_to(tf.expand_dims(tf.range(max_sent_length), -1), [max_sent_length, batch_size]) - ) - - decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding) - else: - decoded = input_ids - - return decoded - - def _generate_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - early_stopping, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - bos_token_id, - pad_token_id, - decoder_start_token_id, - eos_token_id, - batch_size, - num_return_sequences, - length_penalty, - num_beams, - vocab_size, - encoder_outputs, - attention_mask, - use_cache, - ): - """ Generate sequences for each example with beam search. - """ - - # generated hypotheses - generated_hyps = [ - BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) - for _ in range(batch_size) - ] - - # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times - if do_sample is False: - beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32) - beam_scores_end = tf.ones((batch_size, num_beams - 1), dtype=tf.float32) * (-1e9) - beam_scores = tf.concat([beam_scores_begin, beam_scores_end], -1) - else: - beam_scores = tf.zeros((batch_size, num_beams), dtype=tf.float32) - - beam_scores = tf.reshape(beam_scores, (batch_size * num_beams,)) - - # cache compute states - past = encoder_outputs - - # done sentences - done = [False for _ in range(batch_size)] - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache - ) - outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) - next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - next_token_logits_penalties = _create_next_token_logits_penalties( - input_ids, next_token_logits, repetition_penalty - ) - next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) - - # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - - # calculate log softmax score - scores = tf.nn.log_softmax(next_token_logits, axis=-1) # (batch_size * num_beams, vocab_size) - - # set eos token prob to zero if min_length is not reached - if eos_token_id is not None and cur_len < min_length: - # create eos_token_id boolean mask - num_batch_hypotheses = batch_size * num_beams - - is_token_logit_eos_token = tf.convert_to_tensor( - [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool - ) - eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [num_batch_hypotheses, vocab_size]) - - scores = set_tensor_by_indices_to_value(scores, eos_token_indices_mask, -float("inf")) - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - num_batch_hypotheses = batch_size * num_beams - banned_tokens = calc_banned_ngram_tokens( - input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len - ) - # create banned_tokens boolean mask - banned_tokens_indices_mask = [] - for banned_tokens_slice in banned_tokens: - banned_tokens_indices_mask.append( - [True if token in banned_tokens_slice else False for token in range(vocab_size)] - ) - - scores = set_tensor_by_indices_to_value( - scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") - ) - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - banned_tokens_indices_mask = [] - for banned_tokens_slice in banned_tokens: - banned_tokens_indices_mask.append( - [True if token in banned_tokens_slice else False for token in range(vocab_size)] - ) - - scores = set_tensor_by_indices_to_value( - scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") - ) - - assert shape_list(scores) == [batch_size * num_beams, vocab_size] - - if do_sample: - _scores = scores + tf.broadcast_to( - beam_scores[:, None], (batch_size * num_beams, vocab_size) - ) # (batch_size * num_beams, vocab_size) - - # Top-p/top-k filtering - _scores = tf_top_k_top_p_filtering( - _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 - ) # (batch_size * num_beams, vocab_size) - # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) - _scores = tf.reshape(_scores, (batch_size, num_beams * vocab_size)) - - next_tokens = sample_without_replacement( - _scores, num_samples=2 * num_beams - ) # (batch_size, 2 * num_beams) - # Compute next scores - next_scores = tf.gather(_scores, next_tokens, batch_dims=1) # (batch_size, 2 * num_beams) - - # sort the sampled vector to make sure that the first num_beams samples are the best - next_scores_indices = tf.argsort(next_scores, direction="DESCENDING", axis=1) - next_scores = tf.gather(next_scores, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) - next_tokens = tf.gather(next_tokens, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) - else: - # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) - next_scores = scores + tf.broadcast_to( - beam_scores[:, None], (batch_size * num_beams, vocab_size) - ) # (batch_size * num_beams, vocab_size) - - # re-organize to group the beam together (we are keeping top hypothesis accross beams) - next_scores = tf.reshape( - next_scores, (batch_size, num_beams * vocab_size) - ) # (batch_size, num_beams * vocab_size) - - next_scores, next_tokens = tf.math.top_k(next_scores, k=2 * num_beams, sorted=True) - - assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams] - - # next batch beam content - next_batch_beam = [] - - # for each sentence - for batch_idx in range(batch_size): - - # if we are done with this sentence - if done[batch_idx]: - assert ( - len(generated_hyps[batch_idx]) >= num_beams - ), "Batch can only be done if at least {} beams have been generated".format(num_beams) - assert ( - eos_token_id is not None and pad_token_id is not None - ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" - next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch - continue - - # next sentence beam content - next_sent_beam = [] - - # next tokens for this sentence - for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( - zip(next_tokens[batch_idx], next_scores[batch_idx]) - ): - # get beam and token IDs - beam_id = beam_token_id // vocab_size - token_id = beam_token_id % vocab_size - - effective_beam_id = batch_idx * num_beams + beam_id - # add to generated hypotheses if end of sentence or last iteration - if (eos_token_id is not None) and (token_id.numpy() == eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams - if is_beam_token_worse_than_top_num_beams: - continue - generated_hyps[batch_idx].add( - tf.identity(input_ids[effective_beam_id]), beam_token_score.numpy() - ) - else: - # add next predicted token if it is not eos_token - next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) - - # the beam for next step is full - if len(next_sent_beam) == num_beams: - break - - # Check if we are done so that we can save a pad step if all(done) - done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( - tf.reduce_max(next_scores[batch_idx]).numpy(), cur_len - ) - - # update next beam content - assert len(next_sent_beam) == num_beams, "Beam should always be full" - next_batch_beam.extend(next_sent_beam) - assert len(next_batch_beam) == num_beams * (batch_idx + 1) - - # stop when we are done with each sentence - if all(done): - break - - # sanity check / prepare next batch - assert len(next_batch_beam) == batch_size * num_beams - beam_scores = tf.convert_to_tensor([x[0] for x in next_batch_beam], dtype=tf.float32) - beam_tokens = tf.convert_to_tensor([x[1] for x in next_batch_beam], dtype=tf.int32) - beam_idx = tf.convert_to_tensor([x[2] for x in next_batch_beam], dtype=tf.int32) - - # re-order batch and update current length - input_ids = tf.stack([tf.identity(input_ids[x, :]) for x in beam_idx]) - input_ids = tf.concat([input_ids, tf.expand_dims(beam_tokens, 1)], axis=-1) - cur_len = cur_len + 1 - - # re-order internal states - if past is not None: - past = self._reorder_cache(past, beam_idx) - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask = tf.concat( - [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 - ) - - # finalize all open beam hypotheses and end to generated hypotheses - for batch_idx in range(batch_size): - # Add all open beam hypothesis to generated_hyps - if done[batch_idx]: - continue - # test that beam scores match previously calculated scores if not eos and batch_idx not done - if eos_token_id is not None and all( - (token_id % vocab_size).numpy().item() != eos_token_id for token_id in next_tokens[batch_idx] - ): - assert tf.reduce_all( - next_scores[batch_idx, :num_beams] == tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx] - ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( - next_scores[:, :num_beams][batch_idx], tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx] - ) - - # need to add best num_beams hypotheses to generated hyps - for beam_id in range(num_beams): - effective_beam_id = batch_idx * num_beams + beam_id - final_score = beam_scores[effective_beam_id].numpy().item() - final_tokens = input_ids[effective_beam_id] - generated_hyps[batch_idx].add(final_tokens, final_score) - - # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch - output_batch_size = batch_size if do_sample else batch_size * num_return_sequences - output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences - - # select the best hypotheses - sent_lengths_list = [] - best = [] - - # retrieve best hypotheses - for i, hypotheses in enumerate(generated_hyps): - sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) - for j in range(output_num_return_sequences_per_batch): - best_hyp = sorted_hyps.pop()[1] - sent_lengths_list.append(len(best_hyp)) - best.append(best_hyp) - assert output_batch_size == len(best), "Output batch size {} must match output beam hypotheses {}".format( - output_batch_size, len(best) - ) - - sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32) - - # shorter batches are filled with pad_token - if tf.reduce_min(sent_lengths).numpy() != tf.reduce_max(sent_lengths).numpy(): - assert pad_token_id is not None, "`Pad_token_id` has to be defined" - sent_max_len = min(tf.reduce_max(sent_lengths).numpy() + 1, max_length) - decoded_list = [] - - # fill with hypothesis and eos_token_id if necessary - for i, hypo in enumerate(best): - assert sent_lengths[i] == shape_list(hypo)[0] - # if sent_length is max_len do not pad - if sent_lengths[i] == sent_max_len: - decoded_slice = hypo - else: - # else pad to sent_max_len - num_pad_tokens = sent_max_len - sent_lengths[i] - padding = pad_token_id * tf.ones((num_pad_tokens,), dtype=tf.int32) - decoded_slice = tf.concat([hypo, padding], axis=-1) - - # finish sentence with EOS token - if sent_lengths[i] < max_length: - decoded_slice = tf.where( - tf.range(sent_max_len, dtype=tf.int32) == sent_lengths[i], - eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32), - decoded_slice, - ) - # add to list - decoded_list.append(decoded_slice) - - decoded = tf.stack(decoded_list) - else: - # none of the hypotheses have an eos_token - assert (len(hypo) == max_length for hypo in best) - decoded = tf.stack(best) - - return decoded - - @staticmethod - def _reorder_cache(past, beam_idx): - return tuple(tf.gather(layer_past, beam_idx, axis=1) for layer_past in past) - - -def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty): - # create logit penalties for already seen input_ids - token_penalties = np.ones(shape_list(logits)) - prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()] - for i, prev_input_id in enumerate(prev_input_ids): - logit_penalized = logits[i].numpy()[prev_input_id] - logit_penalties = np.zeros(logit_penalized.shape) - # if previous logit score is < 0 then multiply repetition penalty else divide - logit_penalties[logit_penalized < 0] = repetition_penalty - logit_penalties[logit_penalized > 0] = 1 / repetition_penalty - np.put(token_penalties[i], prev_input_id, logit_penalties) - return tf.convert_to_tensor(token_penalties, dtype=tf.float32) - - -def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): - # Copied from fairseq for no_repeat_ngram in beam_search""" - if cur_len + 1 < no_repeat_ngram_size: - # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - return [[] for _ in range(num_hypos)] - generated_ngrams = [{} for _ in range(num_hypos)] - for idx in range(num_hypos): - gen_tokens = prev_input_ids[idx].numpy().tolist() - generated_ngram = generated_ngrams[idx] - for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): - prev_ngram_tuple = tuple(ngram[:-1]) - generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] - - def _get_generated_ngrams(hypo_idx): - # Before decoding the next token, prevent decoding of ngrams that have already appeared - start_idx = cur_len + 1 - no_repeat_ngram_size - ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) - return generated_ngrams[hypo_idx].get(ngram_idx, []) - - banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] - return banned_tokens - - -def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): - banned_tokens = [] - - def _tokens_match(prev_tokens, tokens): - if len(tokens) == 0: - # if bad word tokens is just one token always ban it - return True - if len(tokens) > len(prev_input_ids): - # if bad word tokens are longer then prev input_ids they can't be equal - return False - - if prev_tokens[-len(tokens) :] == tokens: - # if tokens match - return True - else: - return False - - for prev_input_ids_slice in prev_input_ids: - banned_tokens_slice = [] - - for banned_token_seq in bad_words_ids: - assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( - bad_words_ids - ) - - if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: - # if tokens do not match continue - continue - - banned_tokens_slice.append(banned_token_seq[-1]) - - banned_tokens.append(banned_tokens_slice) - - return banned_tokens - - -def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): - """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering - Args: - logits: logits distribution shape (batch size, vocabulary size) - if top_k > 0: keep only top k tokens with highest probability (top-k filtering). - if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). - Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) - Make sure we keep at least min_tokens_to_keep per batch example in the output - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - """ - logits_shape = shape_list(logits) - - if top_k > 0: - top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1]) # Safety check - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None] - logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) - - if top_p < 1.0: - sorted_indices = tf.argsort(logits, direction="DESCENDING") - sorted_logits = tf.gather( - logits, sorted_indices, axis=-1, batch_dims=1 - ) # expects logits to be of dim (batch_size, vocab_size) - - cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1) - - # Remove tokens with cumulative probability above the threshold (token with 0 are kept) - sorted_indices_to_remove = cumulative_probs > top_p - - if min_tokens_to_keep > 1: - # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) - sorted_indices_to_remove = tf.concat( - [ - tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]), - sorted_indices_to_remove[:, min_tokens_to_keep:], - ], - -1, - ) - - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1) - sorted_indices_to_remove = tf.concat( - [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1, - ) - # scatter sorted tensors to original indexing - indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices) - logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) - return logits - - -def scatter_values_on_batch_indices(values, batch_indices): - shape = shape_list(batch_indices) - # broadcast batch dim to shape - broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1]) - # transform batch_indices to pair_indices - pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0)) - # scatter values to pair indices - return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape) - - -def set_tensor_by_indices_to_value(tensor, indices, value): - # create value_tensor since tensor value assignment is not possible in TF - value_tensor = tf.zeros_like(tensor) + value - return tf.where(indices, value_tensor, tensor) - - -class BeamHypotheses(object): - def __init__(self, num_beams, max_length, length_penalty, early_stopping): - """ - Initialize n-best list of hypotheses. - """ - self.max_length = max_length - 1 # ignoring bos_token - self.length_penalty = length_penalty - self.early_stopping = early_stopping - self.num_beams = num_beams - self.beams = [] - self.worst_score = 1e9 - - def __len__(self): - """ - Number of hypotheses in the list. - """ - return len(self.beams) - - def add(self, hyp, sum_logprobs): - """ - Add a new hypothesis to the list. - """ - score = sum_logprobs / len(hyp) ** self.length_penalty - if len(self) < self.num_beams or score > self.worst_score: - self.beams.append((score, hyp)) - if len(self) > self.num_beams: - sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) - del self.beams[sorted_scores[0][1]] - self.worst_score = sorted_scores[1][0] - else: - self.worst_score = min(score, self.worst_score) - - def is_done(self, best_sum_logprobs, cur_len): - """ - If there are enough hypotheses and that none of the hypotheses being generated - can become better than the worst one in the heap, then we are done with this sentence. - """ - - if len(self) < self.num_beams: - return False - elif self.early_stopping: - return True - else: - cur_score = best_sum_logprobs / cur_len ** self.length_penalty - ret = self.worst_score >= cur_score - return ret - class TFConv1D(tf.keras.layers.Layer): def __init__(self, nf, nx, initializer_range=0.02, **kwargs): @@ -1798,17 +752,6 @@ def shape_list(x): return [dynamic[i] if s is None else s for i, s in enumerate(static)] -def sample_without_replacement(logits, num_samples): - """ - categorical sampling witouth replacement is currently not implemented - the gumbel-max trick will do for now - see https://github.com/tensorflow/tensorflow/issues/9260 for more info - """ - z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1)) - _, indices = tf.nn.top_k(logits + z, num_samples) - return indices - - def get_initializer(initializer_range=0.02): """Creates a `tf.initializers.truncated_normal` with the given range. Args: diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 91a167a837..5b000c8125 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -17,7 +17,7 @@ import inspect import logging import os -from typing import Callable, Dict, Iterable, List, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple import torch from torch import Tensor, device, dtype, nn @@ -35,6 +35,7 @@ from .file_utils import ( hf_bucket_url, is_remote_url, ) +from .generation_utils import GenerationMixin logger = logging.getLogger(__name__) @@ -261,7 +262,7 @@ class ModuleUtilsMixin: return head_mask -class PreTrainedModel(nn.Module, ModuleUtilsMixin): +class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): r""" Base class for all models. :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models @@ -801,967 +802,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin): return model - def prepare_inputs_for_generation(self, input_ids, **kwargs): - return {"input_ids": input_ids} - - def adjust_logits_during_generation(self, logits, **kwargs): - return logits - - def _use_cache(self, outputs, use_cache): - """During generation, decide whether to pass the `past` variable to the next forward pass.""" - if len(outputs) <= 1 or use_cache is False: - return False - if hasattr(self.config, "mem_len") and self.config.mem_len == 0: - return False - return True - - def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty): - """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """ - for i in range(batch_size * num_beams): - for previous_token in set(prev_output_tokens[i].tolist()): - # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if lprobs[i, previous_token] < 0: - lprobs[i, previous_token] *= repetition_penalty - else: - lprobs[i, previous_token] /= repetition_penalty - - def postprocess_next_token_scores( - self, - scores, - input_ids, - no_repeat_ngram_size, - bad_words_ids, - cur_len, - min_length, - max_length, - eos_token_id, - repetition_penalty, - batch_size, - num_beams, - ): - # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - self.enforce_repetition_penalty_( - scores, batch_size, num_beams, input_ids, repetition_penalty, - ) - - # set eos token prob to zero if min_length is not reached - if eos_token_id is not None and cur_len < min_length: - scores[:, eos_token_id] = -float("inf") - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - num_batch_hypotheses = batch_size * num_beams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_batch_tokens = calc_banned_ngram_tokens( - input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len - ) - for i, banned_tokens in enumerate(banned_batch_tokens): - scores[i, banned_tokens] = -float("inf") - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - for i, banned_tokens in enumerate(banned_tokens): - scores[i, banned_tokens] = -float("inf") - - return scores - - @torch.no_grad() - def generate( - self, - input_ids: Optional[torch.LongTensor] = None, - max_length: Optional[int] = None, - min_length: Optional[int] = None, - do_sample: Optional[bool] = None, - early_stopping: Optional[bool] = None, - num_beams: Optional[int] = None, - temperature: Optional[float] = None, - top_k: Optional[int] = None, - top_p: Optional[float] = None, - repetition_penalty: Optional[float] = None, - bad_words_ids: Optional[Iterable[int]] = None, - bos_token_id: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - length_penalty: Optional[float] = None, - no_repeat_ngram_size: Optional[int] = None, - num_return_sequences: Optional[int] = None, - attention_mask: Optional[torch.LongTensor] = None, - decoder_start_token_id: Optional[int] = None, - use_cache: Optional[bool] = None, - **model_specific_kwargs - ) -> torch.LongTensor: - r""" Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. - - Adapted in part from `Facebook's XLM beam search code`_. - - .. _`Facebook's XLM beam search code`: - https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 - - - Parameters: - - input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)` - The sequence used as a prompt for the generation. If `None` the method initializes - it as an empty `torch.LongTensor` of shape `(1,)`. - - max_length: (`optional`) int - The max length of the sequence to be generated. Between `min_length` and infinity. Default to 20. - - min_length: (`optional`) int - The min length of the sequence to be generated. Between 0 and infinity. Default to 0. - - do_sample: (`optional`) bool - If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. - - early_stopping: (`optional`) bool - if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. - - num_beams: (`optional`) int - Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. - - temperature: (`optional`) float - The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. - - top_k: (`optional`) int - The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. - - top_p: (`optional`) float - The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. - - repetition_penalty: (`optional`) float - The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. - - pad_token_id: (`optional`) int - Padding token. Default to specicic model pad_token_id or None if it does not exist. - - bos_token_id: (`optional`) int - BOS token. Defaults to `bos_token_id` as defined in the models config. - - eos_token_id: (`optional`) int - EOS token. Defaults to `eos_token_id` as defined in the models config. - - length_penalty: (`optional`) float - Exponential penalty to the length. Default to 1. - - no_repeat_ngram_size: (`optional`) int - If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. - bad_words_ids: (`optional`) list of lists of int - `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. - - num_return_sequences: (`optional`) int - The number of independently computed returned sequences for each element in the batch. Default to 1. - - attention_mask (`optional`) obj: `torch.LongTensor` of same shape as `input_ids` - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - Defaults to `None`. - - `What are attention masks? <../glossary.html#attention-mask>`__ - - decoder_start_token_id=None: (`optional`) int - Start token id for the decoder. Defaults to ``decoder_start_token_id`` as defined the model's config or to the ``bos_token_id`` - if no ``decoder_start_token_id`` is found in the config. - This is only relevant for encoder-decoder models. - - use_cache: (`optional`) bool - If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`. - - model_specific_kwargs: (`optional`) dict - Additional model specific kwargs will be forwarded to the `forward` function of the model. - - Return: - - output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)` - sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` - - Examples:: - - from transformers import AutoTokenizer, AutoModelForCausalLM - - tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer - model = AutoModelForCausalLM.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. - outputs = model.generate(max_length=40) # do greedy decoding - print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) - - tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer - model = AutoModelForCausalLM.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. - input_context = 'The dog' - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' - for i in range(3): # 3 output sequences were generated - print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) - - tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer - model = AutoModelForCausalLM.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. - input_context = 'The dog' - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True) # 3 generate sequences using by sampling - for i in range(3): # 3 output sequences were generated - print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) - - tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer - model = AutoModelForCausalLM.from_pretrained('ctrl') # Download model and configuration from S3 and cache. - input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences - print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) - - tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer - model = AutoModelForCausalLM.from_pretrained('gpt2') # Download model and configuration from S3 and cache. - input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl - bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated - """ - - # We cannot generate if the model does not have a LM head - if self.get_output_embeddings() is None: - raise AttributeError( - "You tried to generate sequences with a model that does not have a LM Head." - "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )" - ) - - max_length = max_length if max_length is not None else self.config.max_length - min_length = min_length if min_length is not None else self.config.min_length - do_sample = do_sample if do_sample is not None else self.config.do_sample - early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping - use_cache = use_cache if use_cache is not None else self.config.use_cache - num_beams = num_beams if num_beams is not None else self.config.num_beams - temperature = temperature if temperature is not None else self.config.temperature - top_k = top_k if top_k is not None else self.config.top_k - top_p = top_p if top_p is not None else self.config.top_p - repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty - bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id - pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id - length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty - no_repeat_ngram_size = ( - no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size - ) - bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids - num_return_sequences = ( - num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences - ) - decoder_start_token_id = ( - decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id - ) - - if input_ids is not None: - batch_size = input_ids.shape[0] # overriden by the input batch_size - else: - batch_size = 1 - - assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." - assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." - assert isinstance(do_sample, bool), "`do_sample` should be a boolean." - assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." - assert isinstance(use_cache, bool), "`use_cache` should be a boolean." - assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." - assert temperature > 0, "`temperature` should be strictly positive." - assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." - assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." - assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." - assert input_ids is not None or ( - isinstance(bos_token_id, int) and bos_token_id >= 0 - ), "If input_ids is not defined, `bos_token_id` should be a positive integer." - assert pad_token_id is None or ( - isinstance(pad_token_id, int) and (pad_token_id >= 0) - ), "`pad_token_id` should be a positive integer." - assert (eos_token_id is None) or ( - isinstance(eos_token_id, int) and (eos_token_id >= 0) - ), "`eos_token_id` should be a positive integer." - assert length_penalty > 0, "`length_penalty` should be strictly positive." - assert ( - isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 - ), "`no_repeat_ngram_size` should be a positive integer." - assert ( - isinstance(num_return_sequences, int) and num_return_sequences > 0 - ), "`num_return_sequences` should be a strictly positive integer." - assert ( - bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) - ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" - - if input_ids is None: - assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( - "you should either supply a context to complete as `input_ids` input " - "or a `bos_token_id` (integer >= 0) as a first token to start the generation." - ) - input_ids = torch.full( - (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device, - ) - else: - assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." - - # not allow to duplicate outputs when greedy decoding - if do_sample is False: - if num_beams == 1: - # no_beam_search greedy generation conditions - assert ( - num_return_sequences == 1 - ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" - - else: - # beam_search greedy generation conditions - assert ( - num_beams >= num_return_sequences - ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" - - # create attention mask if necessary - # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 - if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids): - attention_mask = input_ids.ne(pad_token_id).long() - elif attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - # set pad_token_id to eos_token_id if not set. Important that this is done after - # attention_mask is created - if pad_token_id is None and eos_token_id is not None: - logger.warning( - "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) - ) - pad_token_id = eos_token_id - - # current position and vocab size - if hasattr(self.config, "vocab_size"): - vocab_size = self.config.vocab_size - elif ( - self.config.is_encoder_decoder - and hasattr(self.config, "decoder") - and hasattr(self.config.decoder, "vocab_size") - ): - vocab_size = self.config.decoder.vocab_size - - # set effective batch size and effective batch multiplier according to do_sample - if do_sample: - effective_batch_size = batch_size * num_return_sequences - effective_batch_mult = num_return_sequences - else: - effective_batch_size = batch_size - effective_batch_mult = 1 - - if self.config.is_encoder_decoder: - if decoder_start_token_id is None: - decoder_start_token_id = bos_token_id - - assert ( - decoder_start_token_id is not None - ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" - assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) - assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) - - # get encoder and store encoder outputs - encoder = self.get_encoder() - - encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask) - - # Expand input ids if num_beams > 1 or num_return_sequences > 1 - if num_return_sequences > 1 or num_beams > 1: - input_ids_len = input_ids.shape[-1] - input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len) - attention_mask = attention_mask.unsqueeze(1).expand( - batch_size, effective_batch_mult * num_beams, input_ids_len - ) - - input_ids = input_ids.contiguous().view( - effective_batch_size * num_beams, input_ids_len - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - attention_mask = attention_mask.contiguous().view( - effective_batch_size * num_beams, input_ids_len - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - - if self.config.is_encoder_decoder: - # create empty decoder_input_ids - input_ids = torch.full( - (effective_batch_size * num_beams, 1), - decoder_start_token_id, - dtype=torch.long, - device=next(self.parameters()).device, - ) - cur_len = 1 - - assert ( - batch_size == encoder_outputs[0].shape[0] - ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " - - # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) - expanded_batch_idxs = ( - torch.arange(batch_size) - .view(-1, 1) - .repeat(1, num_beams * effective_batch_mult) - .view(-1) - .to(input_ids.device) - ) - # expand encoder_outputs - encoder_outputs = (encoder_outputs[0].index_select(0, expanded_batch_idxs), *encoder_outputs[1:]) - - else: - encoder_outputs = None - cur_len = input_ids.shape[-1] - - if num_beams > 1: - output = self._generate_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - early_stopping=early_stopping, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - batch_size=effective_batch_size, - num_return_sequences=num_return_sequences, - length_penalty=length_penalty, - num_beams=num_beams, - vocab_size=vocab_size, - encoder_outputs=encoder_outputs, - attention_mask=attention_mask, - use_cache=use_cache, - model_specific_kwargs=model_specific_kwargs, - ) - else: - output = self._generate_no_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - batch_size=effective_batch_size, - encoder_outputs=encoder_outputs, - attention_mask=attention_mask, - use_cache=use_cache, - model_specific_kwargs=model_specific_kwargs, - ) - - return output - - def _generate_no_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - pad_token_id, - eos_token_id, - batch_size, - encoder_outputs, - attention_mask, - use_cache, - model_specific_kwargs, - ): - """ Generate sequences for each example without beam search (num_beams == 1). - All returned sequence are generated independantly. - """ - # length of generated sentences / unfinished sentences - unfinished_sents = input_ids.new(batch_size).fill_(1) - sent_lengths = input_ids.new(batch_size).fill_(max_length) - - past = (encoder_outputs, None) if encoder_outputs is not None else None - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs - ) - - outputs = self(**model_inputs) - next_token_logits = outputs[0][:, -1, :] - - scores = self.postprocess_next_token_scores( - scores=next_token_logits, - input_ids=input_ids, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - cur_len=cur_len, - min_length=min_length, - max_length=max_length, - eos_token_id=eos_token_id, - repetition_penalty=repetition_penalty, - batch_size=batch_size, - num_beams=1, - ) - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - if do_sample: - # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: - scores = scores / temperature - # Top-p/top-k filtering - next_token_logscores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p) - # Sample - probs = F.softmax(next_token_logscores, dim=-1) - next_token = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - # Greedy decoding - next_token = torch.argmax(next_token_logits, dim=-1) - - # update generations and finished sentences - if eos_token_id is not None: - # pad finished sentences if eos_token_id exist - tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) - else: - tokens_to_add = next_token - - # add token and increase length by one - input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) - cur_len = cur_len + 1 - - if eos_token_id is not None: - eos_in_sents = tokens_to_add == eos_token_id - # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length - is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool() - sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len) - # unfinished_sents is set to zero if eos in sentence - unfinished_sents.mul_((~eos_in_sents).long()) - - # stop when there is a in each sentence, or if we exceed the maximul length - if unfinished_sents.max() == 0: - break - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - return input_ids - - def _generate_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - early_stopping, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - pad_token_id, - eos_token_id, - batch_size, - num_return_sequences, - length_penalty, - num_beams, - vocab_size, - encoder_outputs, - attention_mask, - use_cache, - model_specific_kwargs, - ): - """ Generate sequences for each example with beam search. - """ - - # generated hypotheses - generated_hyps = [ - BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) - for _ in range(batch_size) - ] - - # scores for each sentence in the beam - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) - - # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times - if do_sample is False: - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) - - # cache compute states - past = (encoder_outputs, None) if encoder_outputs is not None else None - - # done sentences - done = [False for _ in range(batch_size)] - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs - ) - outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) - next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - if self.config.is_encoder_decoder and do_sample is False: - # TODO (PVP) still a bit hacky here - there might be a better solution - next_token_logits = self.adjust_logits_during_generation( - next_token_logits, cur_len=cur_len, max_length=max_length - ) - - scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) - - scores = self.postprocess_next_token_scores( - scores=scores, - input_ids=input_ids, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - cur_len=cur_len, - min_length=min_length, - max_length=max_length, - eos_token_id=eos_token_id, - repetition_penalty=repetition_penalty, - batch_size=batch_size, - num_beams=num_beams, - ) - - assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format( - scores.shape, (batch_size * num_beams, vocab_size) - ) - - if do_sample: - _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) - # Temperature - if temperature != 1.0: - _scores = _scores / temperature - # Top-p/top-k filtering - _scores = top_k_top_p_filtering( - _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 - ) # (batch_size * num_beams, vocab_size) - # re-organize to group the beam together to sample from all beam_idxs - _scores = _scores.contiguous().view( - batch_size, num_beams * vocab_size - ) # (batch_size, num_beams * vocab_size) - - # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) - probs = F.softmax(_scores, dim=-1) - next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2) - # Compute next scores - next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2) - # sort the sampled vector to make sure that the first num_beams samples are the best - next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1) - next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2) - - else: - next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) - - # re-organize to group the beam together (we are keeping top hypothesis accross beams) - next_scores = next_scores.view( - batch_size, num_beams * vocab_size - ) # (batch_size, num_beams * vocab_size) - - next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True) - - assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) - - # next batch beam content - next_batch_beam = [] - - # for each sentence - for batch_idx in range(batch_size): - - # if we are done with this sentence, add a pad token - if done[batch_idx]: - assert ( - len(generated_hyps[batch_idx]) >= num_beams - ), "Batch can only be done if at least {} beams have been generated".format(num_beams) - assert ( - eos_token_id is not None and pad_token_id is not None - ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" - next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch - continue - - # next sentence beam content, this will get added to next_batch_beam - next_sent_beam = [] - - # next tokens for this sentence - for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( - zip(next_tokens[batch_idx], next_scores[batch_idx]) - ): - # get beam and token IDs - beam_id = beam_token_id // vocab_size - token_id = beam_token_id % vocab_size - - effective_beam_id = batch_idx * num_beams + beam_id - # add to generated hypotheses if end of sentence - if (eos_token_id is not None) and (token_id.item() == eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams - if is_beam_token_worse_than_top_num_beams: - continue - generated_hyps[batch_idx].add( - input_ids[effective_beam_id].clone(), beam_token_score.item(), - ) - else: - # add next predicted token since it is not eos_token - next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) - - # once the beam for next step is full, don't add more tokens to it. - if len(next_sent_beam) == num_beams: - break - - # Check if we are done so that we can save a pad step if all(done) - done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( - next_scores[batch_idx].max().item(), cur_len - ) - - # update next beam content - assert len(next_sent_beam) == num_beams, "Beam should always be full" - next_batch_beam.extend(next_sent_beam) - assert len(next_batch_beam) == num_beams * (batch_idx + 1), "We should have added num_beams each step" - - # stop when we are done with each sentence - if all(done): - break - - # sanity check / prepare next batch - assert len(next_batch_beam) == batch_size * num_beams - beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) - beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) - beam_idx = input_ids.new([x[2] for x in next_batch_beam]) - - # re-order batch and update current length - input_ids = input_ids[beam_idx, :] - input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) - cur_len = cur_len + 1 - - # re-order internal states - if past is not None: - past = self._reorder_cache(past, beam_idx) - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # finalize all open beam hypotheses and add to generated hypotheses - for batch_idx in range(batch_size): - if done[batch_idx]: - continue - - # test that beam scores match previously calculated scores if not eos and batch_idx not done - if eos_token_id is not None and all( - (token_id % vocab_size).item() != eos_token_id for token_id in next_tokens[batch_idx] - ): - assert torch.all( - next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx] - ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( - next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx], - ) - - # need to add best num_beams hypotheses to generated hyps - for beam_id in range(num_beams): - effective_beam_id = batch_idx * num_beams + beam_id - final_score = beam_scores[effective_beam_id].item() - final_tokens = input_ids[effective_beam_id] - generated_hyps[batch_idx].add(final_tokens, final_score) - - # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch - output_batch_size = batch_size if do_sample else batch_size * num_return_sequences - output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences - - # select the best hypotheses - sent_lengths = input_ids.new(output_batch_size) - best = [] - - # retrieve best hypotheses - for i, hypotheses in enumerate(generated_hyps): - sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) - for j in range(output_num_return_sequences_per_batch): - effective_batch_idx = output_num_return_sequences_per_batch * i + j - best_hyp = sorted_hyps.pop()[1] - sent_lengths[effective_batch_idx] = len(best_hyp) - best.append(best_hyp) - - # shorter batches are padded - if sent_lengths.min().item() != sent_lengths.max().item(): - assert pad_token_id is not None, "`Pad_token_id` has to be defined" - sent_max_len = min(sent_lengths.max().item() + 1, max_length) - decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id) - - # fill with hypothesis and eos_token_id if necessary - for i, hypo in enumerate(best): - decoded[i, : sent_lengths[i]] = hypo - if sent_lengths[i] < max_length: - decoded[i, sent_lengths[i]] = eos_token_id - else: - # none of the hypotheses have an eos_token - assert (len(hypo) == max_length for hypo in best) - decoded = torch.stack(best).type(torch.long).to(next(self.parameters()).device) - - return decoded - - @staticmethod - def _reorder_cache(past: Tuple, beam_idx: Tensor) -> Tuple[Tensor]: - return tuple(layer_past.index_select(1, beam_idx) for layer_past in past) - - -def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int) -> None: - """Copied from fairseq for no_repeat_ngram in beam_search""" - if cur_len + 1 < no_repeat_ngram_size: - # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - return [[] for _ in range(num_hypos)] - generated_ngrams = [{} for _ in range(num_hypos)] - for idx in range(num_hypos): - gen_tokens = prev_input_ids[idx].tolist() - generated_ngram = generated_ngrams[idx] - for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): - prev_ngram_tuple = tuple(ngram[:-1]) - generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] - - def _get_generated_ngrams(hypo_idx): - # Before decoding the next token, prevent decoding of ngrams that have already appeared - start_idx = cur_len + 1 - no_repeat_ngram_size - ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist()) - return generated_ngrams[hypo_idx].get(ngram_idx, []) - - banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] - return banned_tokens - - -def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iterable[int]) -> Iterable[int]: - banned_tokens = [] - - def _tokens_match(prev_tokens, tokens): - if len(tokens) == 0: - # if bad word tokens is just one token always ban it - return True - if len(tokens) > len(prev_input_ids): - # if bad word tokens are longer then prev input_ids they can't be equal - return False - - if prev_tokens[-len(tokens) :] == tokens: - # if tokens match - return True - else: - return False - - for prev_input_ids_slice in prev_input_ids: - banned_tokens_slice = [] - - for banned_token_seq in bad_words_ids: - assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( - bad_words_ids - ) - - if _tokens_match(prev_input_ids_slice.tolist(), banned_token_seq[:-1]) is False: - # if tokens do not match continue - continue - - banned_tokens_slice.append(banned_token_seq[-1]) - - banned_tokens.append(banned_tokens_slice) - - return banned_tokens - - -def top_k_top_p_filtering( - logits: Tensor, - top_k: int = 0, - top_p: float = 1.0, - filter_value: float = -float("Inf"), - min_tokens_to_keep: int = 1, -) -> Tensor: - """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering - Args: - logits: logits distribution shape (batch size, vocabulary size) - if top_k > 0: keep only top k tokens with highest probability (top-k filtering). - if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). - Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) - Make sure we keep at least min_tokens_to_keep per batch example in the output - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - """ - if top_k > 0: - top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] - logits[indices_to_remove] = filter_value - - if top_p < 1.0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold (token with 0 are kept) - sorted_indices_to_remove = cumulative_probs > top_p - if min_tokens_to_keep > 1: - # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) - sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) - logits[indices_to_remove] = filter_value - return logits - - -class BeamHypotheses(object): - def __init__(self, num_beams, max_length, length_penalty, early_stopping): - """ - Initialize n-best list of hypotheses. - """ - self.max_length = max_length - 1 # ignoring bos_token - self.length_penalty = length_penalty - self.early_stopping = early_stopping - self.num_beams = num_beams - self.beams = [] - self.worst_score = 1e9 - - def __len__(self): - """ - Number of hypotheses in the list. - """ - return len(self.beams) - - def add(self, hyp, sum_logprobs): - """ - Add a new hypothesis to the list. - """ - score = sum_logprobs / len(hyp) ** self.length_penalty - if len(self) < self.num_beams or score > self.worst_score: - self.beams.append((score, hyp)) - if len(self) > self.num_beams: - sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) - del self.beams[sorted_scores[0][1]] - self.worst_score = sorted_scores[1][0] - else: - self.worst_score = min(score, self.worst_score) - - def is_done(self, best_sum_logprobs, cur_len): - """ - If there are enough hypotheses and that none of the hypotheses being generated - can become better than the worst one in the heap, then we are done with this sentence. - """ - - if len(self) < self.num_beams: - return False - elif self.early_stopping: - return True - else: - cur_score = best_sum_logprobs / cur_len ** self.length_penalty - ret = self.worst_score >= cur_score - return ret - class Conv1D(nn.Module): def __init__(self, nf, nx):