From f51ac9e059a78049362803c1d606a2c6a8160ee4 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 17 Oct 2024 16:53:48 +0100 Subject: [PATCH] Generate: visit non-llm `prepare_inputs_for_generation` (#34199) * tmp * all visited * test all * Update src/transformers/models/moshi/modeling_moshi.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * delete another one :D --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/generation/utils.py | 30 ++-- src/transformers/models/bark/modeling_bark.py | 1 + .../modeling_bigbird_pegasus.py | 17 -- .../models/bloom/modeling_bloom.py | 1 + .../models/chameleon/modeling_chameleon.py | 3 + .../models/codegen/modeling_codegen.py | 1 + .../models/cohere/modeling_cohere.py | 1 + src/transformers/models/dbrx/modeling_dbrx.py | 1 + .../modeling_encoder_decoder.py | 17 +- .../models/falcon/modeling_falcon.py | 1 + src/transformers/models/fuyu/modeling_fuyu.py | 2 + .../models/gemma/modeling_gemma.py | 1 + .../models/gemma2/modeling_gemma2.py | 1 + src/transformers/models/git/modeling_git.py | 2 + .../models/gpt_neo/modeling_gpt_neo.py | 1 + .../models/gpt_neox/modeling_gpt_neox.py | 1 + .../modeling_gpt_neox_japanese.py | 1 + src/transformers/models/gptj/modeling_gptj.py | 1 + .../models/granite/modeling_granite.py | 1 + .../models/granitemoe/modeling_granitemoe.py | 1 + .../models/idefics/modeling_idefics.py | 3 + .../models/idefics2/modeling_idefics2.py | 3 + .../models/idefics3/modeling_idefics3.py | 3 + .../models/jetmoe/modeling_jetmoe.py | 1 + .../models/kosmos2/modeling_kosmos2.py | 2 + .../models/llama/modeling_llama.py | 1 + .../models/llava/modeling_llava.py | 2 + .../models/llava_next/modeling_llava_next.py | 2 + .../modeling_llava_next_video.py | 2 + .../modular_llava_next_video.py | 2 + .../modeling_llava_onevision.py | 2 + .../models/mistral/modeling_mistral.py | 72 -------- .../models/mixtral/modeling_mixtral.py | 70 -------- .../models/mllama/modeling_mllama.py | 3 + .../models/moshi/modeling_moshi.py | 154 ++---------------- .../models/musicgen/modeling_musicgen.py | 2 + .../modeling_musicgen_melody.py | 2 + .../models/nemotron/modeling_nemotron.py | 1 + src/transformers/models/olmo/modeling_olmo.py | 1 + .../models/olmoe/modeling_olmoe.py | 1 + .../models/paligemma/modeling_paligemma.py | 27 +-- .../models/persimmon/modeling_persimmon.py | 1 + src/transformers/models/phi/modeling_phi.py | 1 + src/transformers/models/phi3/modeling_phi3.py | 67 ++------ .../models/phimoe/modeling_phimoe.py | 67 ++------ .../models/pix2struct/modeling_pix2struct.py | 43 ----- .../models/pop2piano/modeling_pop2piano.py | 27 --- .../models/qwen2/modeling_qwen2.py | 73 --------- .../qwen2_audio/modeling_qwen2_audio.py | 7 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 73 --------- .../models/qwen2_vl/modeling_qwen2_vl.py | 2 + .../seamless_m4t/modeling_seamless_m4t.py | 132 --------------- .../modeling_seamless_m4t_v2.py | 114 ------------- .../modeling_speech_encoder_decoder.py | 17 +- .../speech_to_text/modeling_speech_to_text.py | 27 --- .../models/speecht5/modeling_speecht5.py | 2 + .../models/stablelm/modeling_stablelm.py | 1 + .../models/starcoder2/modeling_starcoder2.py | 73 --------- src/transformers/models/udop/modeling_udop.py | 30 ---- .../video_llava/modeling_video_llava.py | 2 + .../models/vipllava/modeling_vipllava.py | 2 + .../modeling_vision_encoder_decoder.py | 17 +- .../models/whisper/modeling_whisper.py | 45 +---- utils/check_copies.py | 10 +- 64 files changed, 140 insertions(+), 1134 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 86ea702dd9..9ede527ecb 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -390,13 +390,16 @@ class GenerationMixin: # 3. Prepare base model inputs input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and not self.config.is_encoder_decoder and cache_position[0] == 0: - model_inputs[input_ids_key] = None - model_inputs["inputs_embeds"] = inputs_embeds + if not self.config.is_encoder_decoder: + if inputs_embeds is not None and cache_position[0] == 0: + model_inputs[input_ids_key] = None + model_inputs["inputs_embeds"] = inputs_embeds + else: + # `clone` calls in this function ensure a consistent stride. See #32227 + model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format) + model_inputs["inputs_embeds"] = None else: - # `clone` calls in this function ensure a consistent stride. See #32227 model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format) - model_inputs["inputs_embeds"] = None # 4. Create missing `position_ids` on the fly if ( @@ -428,10 +431,15 @@ class GenerationMixin: # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create # the 4D causal mask exists, it should be present in the base model (XXXModel class). - base_model = getattr(self, self.base_model_prefix) - causal_mask_creation_function = getattr( - base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None - ) + base_model = getattr(self, self.base_model_prefix, None) + if base_model is None: + causal_mask_creation_function = getattr( + self, "_prepare_4d_causal_attention_mask_with_cache_position", None + ) + else: + causal_mask_creation_function = getattr( + base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None + ) if causal_mask_creation_function is None: logger.warning_once( f"{self.__class__.__name__} has no `_prepare_4d_causal_attention_mask_with_cache_position` method " @@ -444,10 +452,12 @@ class GenerationMixin: attention_mask, sequence_length=sequence_length, target_length=past_key_values.get_max_cache_shape(), - dtype=self.get_output_embeddings().weight.dtype, + dtype=self.dtype, device=device, cache_position=cache_position, batch_size=batch_size, + config=self.config, + past_key_values=past_key_values, ) if attention_mask is not None: model_inputs["attention_mask"] = attention_mask diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index ea42048237..f1c77367e5 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -578,6 +578,7 @@ class BarkCausalModel(BarkPreTrainedModel, GenerationMixin): self.input_embeds_layer = new_embeddings def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + # Overwritten -- bark has a model-specific hack input_embeds = kwargs.get("input_embeds", None) attention_mask = kwargs.get("attention_mask", None) diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 19540a7498..520e7dab1f 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -3020,23 +3020,6 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin): cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs - ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - if past_key_values: - input_ids = input_ids[:, -1:] - # first step, decoder_cached_states are empty - return { - "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed - "attention_mask": attention_mask, - "past_key_values": past_key_values, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index 75f8e5830f..b0e9a4bbcb 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -806,6 +806,7 @@ class BloomModel(BloomPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 20dbfc317e..d0b964a7a6 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1451,6 +1451,7 @@ class ChameleonModel(ChameleonPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1644,6 +1645,8 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi use_cache=True, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 478745b2c5..616c93a46e 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -649,6 +649,7 @@ class CodeGenModel(CodeGenPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index a5d3721f5b..3c14a6d28d 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -1017,6 +1017,7 @@ class CohereModel(CoherePreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index ef81e43d02..659fa154ec 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1179,6 +1179,7 @@ class DbrxModel(DbrxPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 359a4eabcf..d1029160dd 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -26,6 +26,7 @@ from torch import nn from torch.nn import CrossEntropyLoss from ...configuration_utils import PretrainedConfig +from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings @@ -166,7 +167,7 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start @add_start_docstrings(ENCODER_DECODER_START_DOCSTRING) -class EncoderDecoderModel(PreTrainedModel): +class EncoderDecoderModel(PreTrainedModel, GenerationMixin): r""" [`EncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one of the base model classes of the library as encoder and another one as decoder when created with the @@ -666,20 +667,6 @@ class EncoderDecoderModel(PreTrainedModel): def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs - ): - decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) - input_dict = { - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_inputs.get("attention_mask"), - "decoder_input_ids": decoder_inputs["input_ids"], - "encoder_outputs": encoder_outputs, - "past_key_values": decoder_inputs.get("past_key_values"), - "use_cache": use_cache, - } - return input_dict - def resize_token_embeddings(self, *args, **kwargs): raise NotImplementedError( "Resizing the embedding layers via the EncoderDecoderModel directly is not supported. Please use the" diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index f48accab44..504dcf10b2 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -1188,6 +1188,7 @@ class FalconModel(FalconPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index d2fd197073..c8c758e688 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -344,6 +344,8 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin): image_patches_indices=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + if past_key_values: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index ff206a470b..f164c4add1 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -933,6 +933,7 @@ class GemmaModel(GemmaPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 0b99aa59c6..8f7e7364b5 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -905,6 +905,7 @@ class Gemma2Model(Gemma2PreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index c7f9ceafe1..0b86a41378 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -1609,6 +1609,8 @@ class GitForCausalLM(GitPreTrainedModel, GenerationMixin): def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs ): + # Overwritten -- `git` has special cache handling and doesn't support generating from `inputs_embeds` atm + # cut decoder_input_ids if past_key_values is used if past_key_values is not None: past_length = past_key_values.get_seq_length() diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 7bba7608e6..28bfbabc1f 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -863,6 +863,7 @@ class GPTNeoModel(GPTNeoPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index f4636db0a9..359996983e 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -1060,6 +1060,7 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index b618f531e5..6c3f3313f5 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -764,6 +764,7 @@ class GPTNeoXJapaneseModel(GPTNeoXJapanesePreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 5c80485823..1cc9cf369d 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -958,6 +958,7 @@ class GPTJModel(GPTJPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index 0eb27d452f..bb8c157df3 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -958,6 +958,7 @@ class GraniteModel(GranitePreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index ebdea826fa..f3e2d67734 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -1194,6 +1194,7 @@ class GraniteMoeModel(GraniteMoePreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 4dd5f36a93..bc98374455 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1441,6 +1441,7 @@ class IdeficsModel(IdeficsPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1674,6 +1675,8 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin): use_cache=None, **kwargs, ): + # Overwritten -- custom processing based on `config.use_resampler` + model_inputs = {} if image_hidden_states is not None: if self.config.use_resampler: diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index d34e0acde4..daa8bfb055 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1665,6 +1665,9 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin) num_logits_to_keep=None, **kwargs, ): + # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take + # precedence is moved to the model, we can remove this fn) + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens if past_key_values is not None: if inputs_embeds is not None: # Exception 1 diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index e653fd3d2a..fb9f0a7c58 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -1256,6 +1256,9 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin) num_logits_to_keep=None, **kwargs, ): + # Overwritten -- there are mutually exclusive inputs (if the logic to make `image_hidden_states` take + # precedence is moved to the model, we can remove this fn) + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens if past_key_values is not None: if inputs_embeds is not None: # Exception 1 diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index bbc70b26d1..805c82be38 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -1160,6 +1160,7 @@ class JetMoeModel(JetMoePreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index 7674e29db6..ffd8277f02 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -1696,6 +1696,8 @@ class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin): use_cache=None, **model_kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly if attention_mask is None: diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index dde017bbb9..40db21aeae 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -1053,6 +1053,7 @@ class LlamaModel(LlamaPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 411b96f5c5..31593bc62d 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -590,6 +590,8 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin): num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + # Trigger the new behavior if we have more than image embeddings seq length tokens for images legacy_processing = ( input_ids is not None diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 75dfcf5393..03ab28dfff 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -968,6 +968,8 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + legacy_processing = ( input_ids is not None and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 30257b8439..3fd6bb47fc 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -1057,6 +1057,8 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene num_logits_to_keep=None, **kwargs, ): + # Overwritten -- extra custom processing + if input_ids is not None: img_token_not_enough = (input_ids == self.config.image_token_index).sum( 1 diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index e7de66de44..ec5a05733e 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -572,6 +572,8 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): num_logits_to_keep=None, **kwargs, ): + # Overwritten -- extra custom processing + if input_ids is not None: img_token_not_enough = (input_ids == self.config.image_token_index).sum( 1 diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 3eefb517b1..7bacd2a54f 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -728,6 +728,8 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 1bb7a3f109..3f26b5fe03 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -1129,78 +1129,6 @@ class MistralForCausalLM(MistralPreTrainedModel, GenerationMixin): attentions=outputs.attentions, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # `contiguous()` needed for compilation use cases - model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 9bb0654f03..9248ad2187 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1367,76 +1367,6 @@ class MixtralForCausalLM(MixtralPreTrainedModel, GenerationMixin): router_logits=outputs.router_logits, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - output_router_logits=False, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - "output_router_logits": output_router_logits, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index e486e149e3..b8d2879612 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1785,6 +1785,7 @@ class MllamaTextModel(MllamaPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -2171,6 +2172,8 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin): num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 5746a5934b..97200b7d04 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -1400,90 +1400,6 @@ class MoshiDepthDecoder(MoshiPreTrainedModel, GenerationMixin): ) return causal_mask - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[Cache] = None, - attention_mask: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - cache_position: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - use_cache: bool = True, - num_logits_to_keep: Optional[int] = None, - **kwargs, - ): - """ - Prepare the model inputs for generation. In includes operations like computing the 4D attention mask or - slicing inputs given the existing cache. - See the documentation in the used model for the arguments (different models might have different requirements - for e.g. `past_key_values`). Should work as is for most LLMs. - """ - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s - # `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the - # decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, - # `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. - model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create - # the 4D causal mask exists, it should be present in the base model (XXXModel class). - attention_mask = self._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.max_cache_len, - dtype=self.text_embed_tokens.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - "last_hidden_state": kwargs.get("last_hidden_state"), - } - ) - return model_inputs - @add_start_docstrings( "The bare Moshi Model outputting raw hidden-states without any specific head on top.", @@ -2492,66 +2408,18 @@ class MoshiForConditionalGeneration(MoshiPreTrainedModel, GenerationMixin): blank_user_audio_codes: Optional[torch.FloatTensor] = None, **kwargs, ): + # Overwritten -- Moshi has custom post-processing # 1. Do usual operations done on LLMs like Gemma - because we pre-processed inputs, the first pass always has inputs_embeds - - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. - model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - dtype = self.decoder.dtype - - attention_mask = self.decoder.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.max_cache_len, - dtype=dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } + model_inputs = super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + num_logits_to_keep=num_logits_to_keep, + **kwargs, ) # 2. Now that everything is prepared, generate audio_codes using the depth decoder diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 8d7f6ad3c7..626097f5c7 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1345,6 +1345,7 @@ class MusicgenForCausalLM(MusicgenPreTrainedModel, GenerationMixin): guidance_scale=None, **kwargs, ): + # Overwritten -- MusicGen has custom processing if delay_pattern_mask is None: input_ids, delay_pattern_mask = self.build_delay_pattern_mask( input_ids, @@ -2180,6 +2181,7 @@ class MusicgenForConditionalGeneration(PreTrainedModel, GenerationMixin): guidance_scale=None, **kwargs, ): + # Overwritten -- MusicGen has custom processing if decoder_delay_pattern_mask is None: decoder_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask( decoder_input_ids, diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 96b8d29db8..166623796d 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1254,6 +1254,7 @@ class MusicgenMelodyForCausalLM(MusicgenMelodyPreTrainedModel, GenerationMixin): guidance_scale=None, **kwargs, ): + # Overwritten -- MusicGen has custom processing if delay_pattern_mask is None: input_ids, delay_pattern_mask = self.build_delay_pattern_mask( input_ids, @@ -2058,6 +2059,7 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel, GenerationMixin): guidance_scale=None, **kwargs, ): + # Overwritten -- MusicGen has custom processing if decoder_delay_pattern_mask is None: decoder_input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask( decoder_input_ids, diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index 7d0390adc3..aa2fd93fbe 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -931,6 +931,7 @@ class NemotronModel(NemotronPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 7ab54146c9..ff45fffb6e 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -973,6 +973,7 @@ class OlmoModel(OlmoPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index 8c29f89ff3..d9d4a9771c 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -1131,6 +1131,7 @@ class OlmoeModel(OlmoePreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index d75a05bda0..0eb2d50e0a 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -59,6 +59,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( batch_size: int, is_training: bool = False, token_type_ids: torch.Tensor = None, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -572,6 +573,7 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi num_logits_to_keep=None, **kwargs, ): + # Overwritten -- custom `position_ids` and `pixel_values` handling model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -581,33 +583,10 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi cache_position=cache_position, use_cache=use_cache, num_logits_to_keep=num_logits_to_keep, + token_type_ids=token_type_ids, **kwargs, ) - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - dtype = self.get_output_embeddings().weight.dtype - min_dtype = torch.finfo(dtype).min - - model_inputs["attention_mask"] = _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), - dtype=dtype, - device=device, - min_dtype=min_dtype, - cache_position=cache_position, - batch_size=batch_size, - ) - - model_inputs["token_type_ids"] = token_type_ids - # position_ids in Paligemma are 1-indexed if model_inputs.get("position_ids") is not None: model_inputs["position_ids"] += 1 diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 7ae3469a4c..61d8b1002f 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -800,6 +800,7 @@ class PersimmonModel(PersimmonPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 3f770c9ec0..807b3fef4f 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -1091,6 +1091,7 @@ class PhiModel(PhiPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 0380c6cd49..a0b4b2ec37 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1351,63 +1351,16 @@ class Phi3ForCausalLM(Phi3PreTrainedModel, GenerationMixin): if past_length <= self.config.original_max_position_embeddings: past_key_values = None - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. - model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } + model_inputs = super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + num_logits_to_keep=num_logits_to_keep, + **kwargs, ) return model_inputs diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index d1705f04dd..1da65d7d39 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -1541,63 +1541,16 @@ class PhimoeForCausalLM(PhimoePreTrainedModel, GenerationMixin): if past_length <= self.config.original_max_position_embeddings: past_key_values = None - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # The clone here is for the same reason as for `position_ids`. - model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } + model_inputs = super().prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + num_logits_to_keep=num_logits_to_keep, + **kwargs, ) return model_inputs diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index f209d7d882..37090677a6 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -1739,46 +1739,3 @@ class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel, GenerationMi encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) - - def prepare_inputs_for_generation( - self, - input_ids, - flattened_patches: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - decoder_attention_mask: Optional[torch.BoolTensor] = None, - past_key_values=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - if decoder_attention_mask is None: - decoder_attention_mask = torch.ones_like(input_ids).to(input_ids.device) - - # cut decoder_input_ids if past_key_values is used - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - return { - "flattened_patches": flattened_patches, - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index e6488898e8..d6f92e9fe0 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -1299,33 +1299,6 @@ class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel, GenerationMixi **kwargs, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - input_ids = input_ids[:, -1:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return self._shift_right(labels) diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 2585352fc9..2e59ebd5eb 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -1229,79 +1229,6 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel, GenerationMixin): attentions=outputs.attentions, ) - # Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # `contiguous()` needed for compilation use cases - model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index 6422baac5f..e923e535da 100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -1253,16 +1253,17 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi attention_mask=attention_mask, ) - # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.prepare_inputs_for_generation with image->audio def prepare_inputs_for_generation( self, input_ids, past_key_values=None, inputs_embeds=None, - input_features=None, # Ignore copy + input_features=None, attention_mask=None, **kwargs, ): + # Overwritten -- custom processing (note: might not be needed, but there are no generation tests running atm) + if past_key_values is not None: if isinstance(past_key_values, Cache): cache_length = past_key_values.get_seq_length() @@ -1270,7 +1271,6 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi else: cache_length = past_length = past_key_values[0][0].shape[2] - # Ignore copy # Here, we get the attention_mask, which was previously stored in the state after _merge_input_ids_with_audio_features. if input_features is not None and kwargs.get("attention_mask") is not None: attention_mask = kwargs["attention_mask"] @@ -1310,7 +1310,6 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi else: model_inputs = {"input_ids": input_ids} - # Ignore copy feature_attention_mask = kwargs.get("feature_attention_mask", None) model_inputs.update( { diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 1a5f6e2ff2..1e741b4a9e 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1433,79 +1433,6 @@ class Qwen2MoeForCausalLM(Qwen2MoePreTrainedModel, GenerationMixin): router_logits=outputs.router_logits, ) - # Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # `contiguous()` needed for compilation use cases - model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index e014a6da6b..5464b40546 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1803,6 +1803,8 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin): video_grid_thw=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index adc01ec40f..c5c3b20284 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -2264,28 +2264,6 @@ class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel, encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.t2u_pad_token_id, self.config.t2u_decoder_start_token_id) @@ -2917,28 +2895,6 @@ class SeamlessM4TForTextToText(SeamlessM4TPreTrainedModel, GenerationMixin): **kwargs, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () @@ -3209,28 +3165,6 @@ class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel): **kwargs, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () @@ -3560,28 +3494,6 @@ class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel): return waveform, waveform_lengths - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () @@ -3931,28 +3843,6 @@ class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel): ) return reordered_past - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @add_start_docstrings( "The original SeamlessM4T Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST).", @@ -4385,28 +4275,6 @@ class SeamlessM4TModel(SeamlessM4TPreTrainedModel): return waveform, waveform_lengths - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index 21265faa22..a8068eb0ad 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -3175,28 +3175,6 @@ class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin): **kwargs, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () @@ -3477,29 +3455,6 @@ class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel): **kwargs, ) - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText._reorder_cache def _reorder_cache(past_key_values, beam_idx): @@ -3871,29 +3826,6 @@ class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel): return waveform, waveform_lengths - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech._reorder_cache def _reorder_cache(past_key_values, beam_idx): @@ -4285,29 +4217,6 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel): ) return reordered_past - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @add_start_docstrings( "The original SeamlessM4Tv2 Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST).", @@ -4786,29 +4695,6 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel): return waveform, waveform_lengths - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - @staticmethod # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel._reorder_cache def _reorder_cache(past_key_values, beam_idx): diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index ef84a4fa5f..a1caa7cf6d 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -21,6 +21,7 @@ from torch import nn from torch.nn import CrossEntropyLoss from ...configuration_utils import PretrainedConfig +from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings @@ -169,7 +170,7 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start @add_start_docstrings(SPEECH_ENCODER_DECODER_START_DOCSTRING) -class SpeechEncoderDecoderModel(PreTrainedModel): +class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin): r""" [`SpeechEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one of the base model classes of the library as encoder and another one as decoder when created with the @@ -574,20 +575,6 @@ class SpeechEncoderDecoderModel(PreTrainedModel): def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs - ): - decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) - input_dict = { - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_inputs.get("attention_mask"), - "decoder_input_ids": decoder_inputs["input_ids"], - "encoder_outputs": encoder_outputs, - "past_key_values": decoder_inputs.get("past_key_values"), - "use_cache": use_cache, - } - return input_dict - def resize_token_embeddings(self, *args, **kwargs): raise NotImplementedError( "Resizing the embedding layers via the SpeechEncoderDecoderModel directly is not supported. Please use the" diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index bdd532fa25..aadc1da500 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1331,33 +1331,6 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel, Generation encoder_attentions=outputs.encoder_attentions, ) - def prepare_inputs_for_generation( - self, - decoder_input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index dbe57c01d9..63b536d185 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -2425,6 +2425,8 @@ class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel): encoder_outputs=None, **kwargs, ): + # Note that this model doesn't inherit from the generation mixin, has unique generate function + # cut decoder_input_ids if past is used if past_key_values is not None: past_length = past_key_values[0][0].shape[2] diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index fe3ad64981..9b445596f6 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -1075,6 +1075,7 @@ class StableLmModel(StableLmPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index e0fdbef1a3..66d36d6db7 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -1204,79 +1204,6 @@ class Starcoder2ForCausalLM(Starcoder2PreTrainedModel, GenerationMixin): attentions=outputs.attentions, ) - # Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - num_logits_to_keep=None, - **kwargs, - ): - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - if past_key_values is not None: - if inputs_embeds is not None: # Exception 1 - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. - position_ids = position_ids.clone(memory_format=torch.contiguous_format) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache_position[0] == 0: - model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} - else: - # `contiguous()` needed for compilation use cases - model_inputs = {"input_ids": input_ids.contiguous(), "inputs_embeds": None} - - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_cache_shape(), - dtype=self.lm_head.weight.dtype, - device=device, - cache_position=cache_position, - batch_size=batch_size, - config=self.config, - past_key_values=past_key_values, - ) - - if num_logits_to_keep is not None: - model_inputs["num_logits_to_keep"] = num_logits_to_keep - - model_inputs.update( - { - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - } - ) - return model_inputs - @add_start_docstrings( """ diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index c621b74232..6be8752d5b 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -1869,36 +1869,6 @@ class UdopForConditionalGeneration(UdopPreTrainedModel, GenerationMixin): encoder_attentions=encoder_outputs.attentions, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - head_mask=None, - decoder_head_mask=None, - cross_attn_head_mask=None, - use_cache=None, - encoder_outputs=None, - **kwargs, - ): - # cut decoder_input_ids if past is used - if past_key_values is not None: - input_ids = input_ids[:, -1:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past_key_values, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "head_mask": head_mask, - "decoder_head_mask": decoder_head_mask, - "cross_attn_head_mask": cross_attn_head_mask, - "use_cache": use_cache, - "bbox": kwargs.get("bbox", None), - "pixel_values": kwargs.get("pixel_values", None), - "visual_bbox": kwargs.get("visual_bbox", None), - } - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration._reorder_cache def _reorder_cache(self, past_key_values, beam_idx): # if decoder past is not included in output diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 20fa0166b8..c9703d263e 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -707,6 +707,8 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + if input_ids is not None: img_token_not_enough = (input_ids == self.config.image_token_index).sum( 1 diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 7634822847..3af32a9caa 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -581,6 +581,8 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin) num_logits_to_keep=None, **kwargs, ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + # Trigger the new behavior if we have more than image embeddings seq length tokens for images legacy_processing = ( input_ids is not None diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 0c3cd95adb..b044dda300 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -24,6 +24,7 @@ from torch import nn from torch.nn import CrossEntropyLoss from ...configuration_utils import PretrainedConfig +from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings @@ -147,7 +148,7 @@ VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r""" @add_start_docstrings(VISION_ENCODER_DECODER_START_DOCSTRING) -class VisionEncoderDecoderModel(PreTrainedModel): +class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin): r""" [`VisionEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one of the base vision model classes of the library as encoder and another one as decoder when created with the @@ -654,20 +655,6 @@ class VisionEncoderDecoderModel(PreTrainedModel): def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs - ): - decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values) - input_dict = { - "attention_mask": attention_mask, - "decoder_attention_mask": decoder_inputs.get("attention_mask"), - "decoder_input_ids": decoder_inputs["input_ids"], - "encoder_outputs": encoder_outputs, - "past_key_values": decoder_inputs.get("past_key_values"), - "use_cache": use_cache, - } - return input_dict - def resize_token_embeddings(self, *args, **kwargs): raise NotImplementedError( "Resizing the embedding layers via the VisionEncoderDecoderModel directly is not supported.Please use the" diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 079965fc17..ce3df3e167 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1442,6 +1442,7 @@ class WhisperDecoder(WhisperPreTrainedModel): device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1817,6 +1818,10 @@ class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedM cache_position=None, **kwargs, ): + # Overwritten -- encoder-decoder whisper has custom logic, but it's close to the general function. Next time + # this function needs to be touched, let's try to sort out the commonalities between the two and remove the + # overwrite. + decoder_position_ids = None if decoder_attention_mask is not None: decoder_position_ids = (decoder_attention_mask.cumsum(-1) - 1).clamp(min=0) @@ -2092,46 +2097,6 @@ class WhisperForCausalLM(WhisperPreTrainedModel, GenerationMixin): cross_attentions=outputs.cross_attentions, ) - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - use_cache=None, - encoder_outputs=None, - attention_mask=None, - cache_position=None, - **kwargs, - ): - past_length = 0 - if past_key_values is not None: - if isinstance(past_key_values, (Cache, EncoderDecoderCache)): - past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() - else: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - if cache_position is None: - cache_position = torch.arange(past_length, past_length + input_ids.shape[1], device=input_ids.device) - elif use_cache: - cache_position = cache_position[-input_ids.shape[1] :] - - return { - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "input_ids": input_ids, - "use_cache": use_cache, - "attention_mask": attention_mask, - "cache_position": cache_position, - } - @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () diff --git a/utils/check_copies.py b/utils/check_copies.py index 4bb5c6fef4..ed6a4f68b5 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -672,9 +672,13 @@ def is_copy_consistent(filename: str, overwrite: bool = False, buffer: dict = No indent, object_name, replace_pattern = search.groups() # Find the file lines, the object's code, and its blocks - target_lines, theoretical_code, theoretical_code_splits = find_code_and_splits( - object_name, base_path, buffer=buffer - ) + try: + target_lines, theoretical_code, theoretical_code_splits = find_code_and_splits( + object_name, base_path, buffer=buffer + ) + except Exception as exc: + exc.args = (f"Error while trying to find source code for {filename}.\n\n" + str(exc),) + raise # code replaced by the patterns theoretical_code_blocks = OrderedDict()