diff --git a/examples/modular-transformers/modeling_my_new_model2.py b/examples/modular-transformers/modeling_my_new_model2.py index 981d40bb6d..19b059699e 100644 --- a/examples/modular-transformers/modeling_my_new_model2.py +++ b/examples/modular-transformers/modeling_my_new_model2.py @@ -333,12 +333,6 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -433,12 +427,6 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index 21aabae4d3..9111883cfe 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -389,12 +389,6 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/examples/modular-transformers/modeling_super.py b/examples/modular-transformers/modeling_super.py index c44f12f02f..fc90cce75a 100644 --- a/examples/modular-transformers/modeling_super.py +++ b/examples/modular-transformers/modeling_super.py @@ -332,12 +332,6 @@ class SuperModel(SuperPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index e20f4c0fe7..f6d0a85857 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1902,7 +1902,97 @@ class ModuleUtilsMixin: return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings) -class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMixin): +class EmbeddingAccessMixin: + """ + Base utilities to regroup getters and setters for embeddings. + Introduces the `input_layer_embed` attribute, which indicates + where the input embeddings come from and where they + should be set. + """ + + _input_embed_layer = "embed_tokens" # default layer that holds input embeddings. + + def get_input_embeddings(self) -> nn.Module: + """ + Returns the model's input embeddings. + + Returns: + `nn.Module`: A torch module mapping vocabulary to hidden states. + """ + + # 1) Check if the model has an attribute named 'embed_tokens' (the standard input embedding layer + # for most NLP models), and if so, return it. + + name = getattr(self, "_input_embed_layer", "embed_tokens") + + if (default_embedding := getattr(self, name, None)) is not None: + return default_embedding + # 2) encoder/decoder and VLMs like `Gemma3nForConditionalGeneration` + + if hasattr(self, "model") and hasattr(self.model, "embed_tokens"): + return self.model.embed_tokens + + # 3) vanilla decoder‑only architectures + elif hasattr(self, "embed_tokens"): + return self.embed_tokens + else: + base_model = getattr(self, "base_model_prefix", None) + if base_model is not None: + base_model = getattr(self, base_model, None) + if base_model is not None and base_model is not self: + return base_model.get_input_embeddings() + raise NotImplementedError( + f"`get_input_embeddings` not auto‑handled for {self.__class__.__name__}; " + "please override in the subclass." + ) + + def set_input_embeddings(self, value: nn.Module): + """Fallback setter that handles **~70 %** of models in the code‑base. + + Order of attempts: + 1. `self.model.embed_tokens` + 2. `self.embed_tokens` + 3. delegate to the *base model* if one exists + 4. otherwise raise `NotImplementedError` so subclasses still can (and + should) override for exotic layouts. + """ + + # 1) encoder/decoder and VLMs like `Gemma3nForConditionalGeneration` + name = getattr(self, "_input_embed_layer", "embed_tokens") + if hasattr(self, "model") and hasattr(self.model, name): + setattr(self.model, name, value) + # 2) as well as vanilla decoder‑only architectures + elif hasattr(self, name): + setattr(self, name, value) + # 3) recurse once into the registered *base* model (e.g. for encoder/decoder) + elif getattr(self, self.base_model_prefix, self) is not self: + base_model = getattr(self, self.base_model_prefix, self) + base_model.set_input_embeddings(value) + else: + raise NotImplementedError( + f"`set_input_embeddings` not auto‑handled for {self.__class__.__name__}; please override in the subclass." + ) + + def get_output_embeddings(self): + if not hasattr(self, "lm_head"): + return None + try: + # Speech / vision backbones raise here, so we return None. + # Legit use of get_input_embs? + self.get_input_embeddings() + except NotImplementedError: + return None + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + """ + Sets the model's output embedding, defaulting to setting new_embeddings to lm_head. + """ + if getattr(self, "lm_head"): + self.lm_head = new_embeddings + + +class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMixin): r""" Base class for all models. @@ -2004,6 +2094,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi _supports_attention_backend = False _can_record_outputs = None + # This attribute sets the default parameter to be + @property @torch._dynamo.allow_in_graph def can_record_outputs(self) -> dict[str, OutputRecorder]: @@ -2267,6 +2359,101 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi return model + @classmethod + def _check_attn_implementation(cls, attn_implementation: Union[str, dict]) -> Union[str, dict]: + """ + Checks that the requested attention implementation exists and tries to get the kernel from hub + if `attn_implementation` matches hf kernels pattern. + """ + if isinstance(attn_implementation, str) and re.match(r"^[^/:]+/[^/:]+:[^/:]+$", attn_implementation): + if not is_kernels_available(): + raise ValueError("kernels is not installed. Please install it with `pip install kernels`.") + + # Extract repo_id and kernel_name from the string + repo_id, kernel_name = attn_implementation.split(":") + kernel_name = kernel_name.strip() + repo_id = repo_id.strip() + + try: + kernel = get_kernel(repo_id) + ALL_ATTENTION_FUNCTIONS.register(f"kernel_{repo_id.replace('/', '_')}", getattr(kernel, kernel_name)) + attn_implementation = f"kernel_{repo_id.replace('/', '_')}" + except FileNotFoundError as e: + logger.warning( + f"Could not find a kernel repository '{repo_id}' compatible with your devicein the hub: {e}. Using eager attention implementation instead." + ) + attn_implementation = None # try to dispatch SDPA and fallback eager if not available + except AttributeError: + raise ValueError( + "the kernel function name or class specified in the attn_implementation argument is not valid. \ + Please check the documentation for the correct format, \ + and check that the kernel exports the class and the function correctly." + ) + if ( + not isinstance(attn_implementation, dict) + and attn_implementation not in ["eager", None] + ALL_ATTENTION_FUNCTIONS.valid_keys() + ): + message = f'Specified `attn_implementation="{attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)' + # check `supports_flash_attn_2` for BC with custom code. TODO: remove after a few releases + if cls._supports_flash_attn or getattr(cls, "_supports_flash_attn_2", False): + message += ( + ', `"attn_implementation=flash_attention_3"` (implementation using flash attention 3)' + ', `"attn_implementation=flash_attention_2"` (implementation using flash attention 2)' + ) + if cls._supports_sdpa: + message += ', `"attn_implementation=sdpa"` (implementation using torch.nn.functional.scaled_dot_product_attention)' + if cls._supports_flex_attn: + message += ', `"attn_implementation=flex_attention"` (implementation using torch\'s flex_attention)' + raise ValueError(message + ".") + + return attn_implementation + + def set_attention_implementation(self, attn_implementation: Union[str, dict]): + """ + Checks and dispatches to the requested attention implementation. + """ + requested_attn_implementation = self._check_attn_implementation(attn_implementation) + + # Composite models consisting of several PretrainedModels can specify attention implementation as a dict where + # keys are sub-config names. But most people will specify one `str` which means that should dispatch it for all sub-models. + # See https://github.com/huggingface/transformers/pull/32238 + for key in self.config.sub_configs.keys(): + sub_config = getattr(self.config, key) + curr_attn_implementation = ( + requested_attn_implementation + if not isinstance(requested_attn_implementation, dict) + else requested_attn_implementation.get(key, None) + ) + # For models with backbone sub-config might be not initialized. Set the requested att + # if the config hasn't got any attn pre-set and the requested attn in not `None` (i.e not the default attn) + if ( + sub_config is not None + and sub_config._attn_implementation_internal is None + and curr_attn_implementation is not None + ): + sub_config._attn_implementation_internal = curr_attn_implementation + + if requested_attn_implementation == "flash_attention_3" and self._flash_attn_3_can_dispatch(): + self.config._attn_implementation = "flash_attention_3" + if requested_attn_implementation == "flash_attention_2" and self._flash_attn_2_can_dispatch(): + self.config._attn_implementation = "flash_attention_2" + elif requested_attn_implementation == "flex_attention" and self._flex_attn_can_dispatch(): + self.config._attn_implementation = "flex_attention" + elif ( + requested_attn_implementation in [None, "sdpa"] + and not is_torch_xla_available() + and self._sdpa_can_dispatch(hard_check_only=requested_attn_implementation is not None) + ): + self.config._attn_implementation = "sdpa" + elif requested_attn_implementation in ALL_ATTENTION_FUNCTIONS.valid_keys(): + self.config._attn_implementation = requested_attn_implementation + elif isinstance(requested_attn_implementation, dict): + self.config._attn_implementation = requested_attn_implementation.get("", None) + else: + self.config._attn_implementation = "eager" + + self.config._attn_implementation_autoset = True + @classmethod def _set_default_torch_dtype(cls, dtype: torch.dtype) -> torch.dtype: """ @@ -2769,41 +2956,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi """ self._require_grads_hook.remove() - def get_input_embeddings(self) -> nn.Module: - """ - Returns the model's input embeddings. - - Returns: - `nn.Module`: A torch module mapping vocabulary to hidden states. - """ - base_model = getattr(self, self.base_model_prefix, self) - if base_model is not self: - return base_model.get_input_embeddings() - else: - raise NotImplementedError - - def set_input_embeddings(self, value: nn.Module): - """ - Set model's input embeddings. - - Args: - value (`nn.Module`): A module mapping vocabulary to hidden states. - """ - base_model = getattr(self, self.base_model_prefix, self) - if base_model is not self: - base_model.set_input_embeddings(value) - else: - raise NotImplementedError - - def get_output_embeddings(self) -> nn.Module: - """ - Returns the model's output embeddings. - - Returns: - `nn.Module`: A torch module mapping hidden states to vocabulary. - """ - return None # Overwrite for models with output embeddings - def _init_weights(self, module): """ Initialize the weights. This method should be overridden by derived class and is diff --git a/src/transformers/models/arcee/modeling_arcee.py b/src/transformers/models/arcee/modeling_arcee.py index 99a763b1e0..43a02ebd8c 100644 --- a/src/transformers/models/arcee/modeling_arcee.py +++ b/src/transformers/models/arcee/modeling_arcee.py @@ -356,12 +356,6 @@ class ArceeModel(ArceePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -438,18 +432,6 @@ class ArceeForCausalLM(ArceePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -533,12 +515,6 @@ class ArceeForSequenceClassification(ArceePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -685,12 +661,6 @@ class ArceeForTokenClassification(ArceePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index 24e741f879..6c5c972b1f 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -740,12 +740,6 @@ class AriaTextModel(AriaTextPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -822,18 +816,6 @@ class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -1146,9 +1128,6 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py index 5692819b3e..cb48470687 100644 --- a/src/transformers/models/aya_vision/modeling_aya_vision.py +++ b/src/transformers/models/aya_vision/modeling_aya_vision.py @@ -368,9 +368,6 @@ class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixi def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py index fa5dda36cd..3548e706a5 100644 --- a/src/transformers/models/bamba/modeling_bamba.py +++ b/src/transformers/models/bamba/modeling_bamba.py @@ -1126,12 +1126,6 @@ class BambaModel(BambaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1376,18 +1370,6 @@ class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/bamba/modular_bamba.py b/src/transformers/models/bamba/modular_bamba.py index 1671007b00..9bfbfd159f 100644 --- a/src/transformers/models/bamba/modular_bamba.py +++ b/src/transformers/models/bamba/modular_bamba.py @@ -854,12 +854,6 @@ class BambaModel(BambaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index 5dd03770ec..2e199d4fdd 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -397,6 +397,11 @@ class BarkCausalModel(BarkPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() + def get_output_embeddings(self): + # NOTE: get_output_embeddings() must return None to prevent accidental weight tying. + # See e.g. https://github.com/huggingface/transformers/pull/39339#discussion_r2219126400 + return None + def get_input_embeddings(self): return self.input_embeds_layer diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 4584373241..055b696405 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -764,12 +764,6 @@ class BartEncoder(BartPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -930,12 +924,6 @@ class BartDecoder(BartPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1369,12 +1357,6 @@ class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin): new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def _tie_weights(self): if self.config.tie_word_embeddings: self.model._tie_weights() @@ -1857,12 +1839,6 @@ class BartForCausalLM(BartPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index 9138e7b840..01d7a0e826 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -888,6 +888,12 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel): # Initialize weights and apply final processing self.post_init() + def get_output_embeddings(self): + # NOTE: get_output_embeddings() must return None to prevent accidental weight tying. + # Vision models like BEiT use a Conv2d patch embed layer (no `.weight`) so calling tie_weights() would fail. + # See e.g. https://github.com/huggingface/transformers/pull/39339#discussion_r2219126400 + return None + @auto_docstring def forward( self, diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 8146d7b018..d567808f95 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -2084,12 +2084,6 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -2506,12 +2500,6 @@ class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, Gene new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def _tie_weights(self): if self.config.tie_word_embeddings: self.model._tie_weights() @@ -2938,12 +2926,6 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index c1575b0c04..a873cd6b69 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -522,12 +522,6 @@ class BioGptModel(BioGptPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, diff --git a/src/transformers/models/biogpt/modular_biogpt.py b/src/transformers/models/biogpt/modular_biogpt.py index 44ccac314d..3f63caddea 100644 --- a/src/transformers/models/biogpt/modular_biogpt.py +++ b/src/transformers/models/biogpt/modular_biogpt.py @@ -347,12 +347,6 @@ class BioGptModel(BioGptPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, diff --git a/src/transformers/models/bitnet/modeling_bitnet.py b/src/transformers/models/bitnet/modeling_bitnet.py index 66cf5d02f4..c373d659c7 100644 --- a/src/transformers/models/bitnet/modeling_bitnet.py +++ b/src/transformers/models/bitnet/modeling_bitnet.py @@ -351,12 +351,6 @@ class BitNetModel(BitNetPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -433,18 +427,6 @@ class BitNetForCausalLM(BitNetPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 3b9467e6e2..65f0378ef5 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -879,12 +879,6 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids=None, @@ -1329,12 +1323,6 @@ class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel, GenerationMi new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, @@ -1503,12 +1491,6 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 7f7701d9da..9030bd1e5c 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -866,12 +866,6 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids=None, @@ -1288,12 +1282,6 @@ class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel, Ge new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, @@ -1462,12 +1450,6 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel, GenerationMixin def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index 7da3e9de9e..f999872bef 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -759,9 +759,6 @@ class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - def set_output_embeddings(self, new_embeddings: torch.Tensor): self.lm_head = new_embeddings diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 9516f8b496..5d8e6fc210 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -886,12 +886,6 @@ class ChameleonModel(ChameleonPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def get_image_tokens(self, pixel_values: torch.FloatTensor): """ Tokenizes images into discrete tokens with VQGAN module. Converts @@ -1181,18 +1175,6 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index c705997c20..b0170e6402 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -1259,6 +1259,11 @@ class ClvpForCausalLM(ClvpPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() + def get_output_embeddings(self): + # NOTE: get_output_embeddings() must return None to prevent accidental weight tying. + # See e.g. https://github.com/huggingface/transformers/pull/39339#discussion_r2219126400 + return None + def get_input_embeddings(self): return self.model.decoder.input_embeds_layer diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 29798287dd..b1378f5517 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -592,12 +592,6 @@ class CodeGenForCausalLM(CodeGenPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 0180161e13..1fb91bccaa 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -384,12 +384,6 @@ class CohereModel(CoherePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -468,18 +462,6 @@ class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/cohere2/modeling_cohere2.py b/src/transformers/models/cohere2/modeling_cohere2.py index a051551690..f3dc518f92 100644 --- a/src/transformers/models/cohere2/modeling_cohere2.py +++ b/src/transformers/models/cohere2/modeling_cohere2.py @@ -361,12 +361,6 @@ class Cohere2Model(Cohere2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -449,18 +443,6 @@ class Cohere2ForCausalLM(Cohere2PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py index e1b1c33276..91dba7da6c 100755 --- a/src/transformers/models/cpmant/modeling_cpmant.py +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -800,11 +800,12 @@ class CpmAntForCausalLM(CpmAntPreTrainedModel, GenerationMixin): def set_input_embeddings(self, embeddings): self.cpmant.input_embedding = embeddings - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings + def _reorder_cache(self, past_key_values, beam_idx): + past_key_values = [list(each) if each is not None else each for each in past_key_values] + for key_value_layer in past_key_values: + key_value_layer[0] = key_value_layer[0][beam_idx] + key_value_layer[1] = key_value_layer[1][beam_idx] + return past_key_values __all__ = ["CpmAntForCausalLM", "CpmAntModel", "CpmAntPreTrainedModel"] diff --git a/src/transformers/models/csm/modeling_csm.py b/src/transformers/models/csm/modeling_csm.py index 91b3ff3987..31e9ff3689 100644 --- a/src/transformers/models/csm/modeling_csm.py +++ b/src/transformers/models/csm/modeling_csm.py @@ -416,12 +416,6 @@ class CsmDepthDecoderModel(CsmPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -553,12 +547,6 @@ class CsmDepthDecoderForCausalLM(CsmPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - def set_decoder(self, decoder): self.model = decoder @@ -687,12 +675,6 @@ class CsmBackboneModel(CsmPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -793,12 +775,6 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin): def set_input_embeddings(self, value): self.backbone_model.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def _tie_weights(self): if self.config.tie_codebooks_embeddings: self._tie_or_clone_weights( diff --git a/src/transformers/models/csm/modular_csm.py b/src/transformers/models/csm/modular_csm.py index e1bc64fd6c..4701612752 100644 --- a/src/transformers/models/csm/modular_csm.py +++ b/src/transformers/models/csm/modular_csm.py @@ -296,12 +296,6 @@ class CsmDepthDecoderForCausalLM(LlamaForCausalLM, GenerationMixin): self.codebooks_head = CsmCodebooksHead(config.hidden_size, config.num_codebooks, config.vocab_size) self.model = CsmDepthDecoderModel(config) - def get_output_embeddings(self): - raise AttributeError("Not needed for Csm") - - def set_output_embeddings(self, new_embeddings): - raise AttributeError("Not needed for Csm") - def prepare_inputs_for_generation( self, input_ids: torch.LongTensor, @@ -458,12 +452,6 @@ class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin): def set_input_embeddings(self, value): self.backbone_model.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def _tie_weights(self): if self.config.tie_codebooks_embeddings: self._tie_or_clone_weights( diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index 675ef96592..f942cb1531 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -455,12 +455,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py b/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py index 3794a392fd..595953fd6c 100644 --- a/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py @@ -500,12 +500,6 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -582,18 +576,6 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -690,12 +672,6 @@ class DeepseekV2ForSequenceClassification(DeepseekV2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py index 9f10d63044..05171a8359 100644 --- a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py @@ -541,12 +541,6 @@ class DeepseekV3Model(DeepseekV3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -623,18 +617,6 @@ class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py index fac33b0818..ae15ffd415 100644 --- a/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py +++ b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py @@ -871,9 +871,6 @@ class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel): self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -1315,12 +1312,6 @@ class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel): def set_input_embeddings(self, new_embeddings): self.model.set_input_embeddings(new_embeddings) - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_output_embeddings(self): - return self.lm_head - def _unpack_router_logits(self, router_outputs): total_router_logits = [] total_expert_indexes = [] diff --git a/src/transformers/models/deprecated/mega/modeling_mega.py b/src/transformers/models/deprecated/mega/modeling_mega.py index 85f314aeea..9eb1c5c1d6 100644 --- a/src/transformers/models/deprecated/mega/modeling_mega.py +++ b/src/transformers/models/deprecated/mega/modeling_mega.py @@ -1662,12 +1662,6 @@ class MegaForCausalLM(MegaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py index 66efbe1c24..abdcc2b40e 100644 --- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -544,12 +544,6 @@ class OpenLlamaModel(OpenLlamaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING) def forward( self, @@ -678,18 +672,6 @@ class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -864,12 +846,6 @@ class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING) def forward( self, diff --git a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py index aa248038f6..bf99ba6255 100755 --- a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py @@ -453,12 +453,6 @@ class Speech2Text2Decoder(Speech2Text2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids=None, @@ -697,12 +691,6 @@ class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel): def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py index 9912cca22f..7b0c3aede2 100644 --- a/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py +++ b/src/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py @@ -1843,12 +1843,6 @@ class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def _tie_weights(self): if self.config.tie_word_embeddings: self._tie_or_clone_weights(self.prophetnet.word_embeddings, self.lm_head) @@ -2074,12 +2068,6 @@ class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel): def set_input_embeddings(self, value): self.prophetnet.decoder.word_embeddings = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def _tie_weights(self): if self.config.tie_word_embeddings: self._tie_or_clone_weights(self.prophetnet.decoder.word_embeddings, self.lm_head) diff --git a/src/transformers/models/diffllama/modeling_diffllama.py b/src/transformers/models/diffllama/modeling_diffllama.py index c97319622d..92badf62f2 100644 --- a/src/transformers/models/diffllama/modeling_diffllama.py +++ b/src/transformers/models/diffllama/modeling_diffllama.py @@ -612,12 +612,6 @@ class DiffLlamaModel(DiffLlamaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -694,18 +688,6 @@ class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -797,12 +779,6 @@ class DiffLlamaForSequenceClassification(DiffLlamaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -949,12 +925,6 @@ class DiffLlamaForTokenClassification(DiffLlamaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/doge/modeling_doge.py b/src/transformers/models/doge/modeling_doge.py index 63813fe3d1..21b2794c03 100644 --- a/src/transformers/models/doge/modeling_doge.py +++ b/src/transformers/models/doge/modeling_doge.py @@ -544,12 +544,6 @@ class DogeModel(DogePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -739,18 +733,6 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -867,12 +849,6 @@ class DogeForSequenceClassification(DogePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/dots1/modeling_dots1.py b/src/transformers/models/dots1/modeling_dots1.py index 6a84d77a05..06df98b835 100644 --- a/src/transformers/models/dots1/modeling_dots1.py +++ b/src/transformers/models/dots1/modeling_dots1.py @@ -460,12 +460,6 @@ class Dots1Model(Dots1PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -555,18 +549,6 @@ class Dots1ForCausalLM(Dots1PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index b66e244234..fd7cbf39e1 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -1174,12 +1174,6 @@ class Emu3TextModel(Emu3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -1257,18 +1251,6 @@ class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -1499,9 +1481,6 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py index e32cfd2dfc..580bf670e3 100644 --- a/src/transformers/models/emu3/modular_emu3.py +++ b/src/transformers/models/emu3/modular_emu3.py @@ -1063,9 +1063,6 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index d7fd324285..71772e4ffa 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -764,6 +764,11 @@ class EsmPreTrainedModel(PreTrainedModel): elif isinstance(module, EsmLMHead): module.bias.data.zero_() + def get_output_embeddings(self): + # NOTE: get_output_embeddings() must return None to prevent accidental weight tying. + # See e.g. https://github.com/huggingface/transformers/pull/39339#discussion_r2219126400 + return None + @auto_docstring class EsmModel(EsmPreTrainedModel): diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 392472d537..4033e2c14d 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -1002,9 +1002,6 @@ class FalconForCausalLM(FalconPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - def set_output_embeddings(self, new_embeddings: torch.Tensor): self.lm_head = new_embeddings diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py index 3c6a8e500e..33955d3a6b 100644 --- a/src/transformers/models/falcon_h1/modeling_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modeling_falcon_h1.py @@ -1238,12 +1238,6 @@ class FalconH1Model(FalconH1PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1484,18 +1478,6 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/falcon_h1/modular_falcon_h1.py b/src/transformers/models/falcon_h1/modular_falcon_h1.py index 305fd7bfbb..47e958eb0c 100644 --- a/src/transformers/models/falcon_h1/modular_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modular_falcon_h1.py @@ -1016,12 +1016,6 @@ class FalconH1Model(FalconH1PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 56a5770ba7..0a60b2b6bf 100644 --- a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -776,12 +776,6 @@ class FalconMambaForCausalLM(FalconMambaPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_input_embeddings(self): return self.backbone.get_input_embeddings() diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index b7a4cb9a05..ff163aacd4 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -254,12 +254,6 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 2aab29381a..287c9b3013 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -353,12 +353,6 @@ class GemmaModel(GemmaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -444,18 +438,6 @@ class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -547,12 +529,6 @@ class GemmaForSequenceClassification(GemmaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -639,12 +615,6 @@ class GemmaForTokenClassification(GemmaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index df45009a92..6db1b1f7bb 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -383,12 +383,6 @@ class Gemma2Model(Gemma2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -516,18 +510,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -638,12 +620,6 @@ class Gemma2ForSequenceClassification(Gemma2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -730,12 +706,6 @@ class Gemma2ForTokenClassification(Gemma2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index a7f4e238f5..c02f2862c5 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -488,12 +488,6 @@ class Gemma3TextModel(Gemma3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -621,18 +615,6 @@ class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -991,12 +973,6 @@ class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 8b9b516d92..1411cccef9 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -1582,12 +1582,6 @@ class Gemma3nTextModel(Gemma3nPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1793,18 +1787,6 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -2212,12 +2194,6 @@ class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index d6cf8e137f..72733a80f7 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -370,12 +370,6 @@ class GlmModel(GlmPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -452,18 +446,6 @@ class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -560,12 +542,6 @@ class GlmForSequenceClassification(GlmPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -652,12 +628,6 @@ class GlmForTokenClassification(GlmPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py index c3c9a0ab1f..e4dd64102d 100644 --- a/src/transformers/models/glm4/modeling_glm4.py +++ b/src/transformers/models/glm4/modeling_glm4.py @@ -374,12 +374,6 @@ class Glm4Model(Glm4PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -456,18 +450,6 @@ class Glm4ForCausalLM(Glm4PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -564,12 +546,6 @@ class Glm4ForSequenceClassification(Glm4PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -656,12 +632,6 @@ class Glm4ForTokenClassification(Glm4PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/glm4_moe/modeling_glm4_moe.py b/src/transformers/models/glm4_moe/modeling_glm4_moe.py index 90a5d85237..31ad8ede95 100644 --- a/src/transformers/models/glm4_moe/modeling_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modeling_glm4_moe.py @@ -480,12 +480,6 @@ class Glm4MoeModel(Glm4MoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -562,18 +556,6 @@ class Glm4MoeForCausalLM(Glm4MoePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index 41f6e34ad2..41e37b3e1a 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -837,12 +837,6 @@ class Glm4vTextModel(Glm4vPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring @can_return_tuple def forward( @@ -1406,12 +1400,6 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py index dc6f81945b..99959cd74b 100644 --- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -682,9 +682,6 @@ class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin): def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 7039169519..c853d80e4a 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -1139,12 +1139,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel, GenerationMixin): self.model_parallel = False torch.cuda.empty_cache() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, @@ -1294,12 +1288,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel, GenerationMixin): self.model_parallel = False torch.cuda.empty_cache() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 89c280f93a..8855d6a593 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -618,12 +618,6 @@ class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index cce119303a..7d655bd0e6 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -788,12 +788,6 @@ class GPTNeoForCausalLM(GPTNeoPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index b868d4353c..15cc664d74 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -403,12 +403,6 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_in - - def set_input_embeddings(self, value): - self.embed_in = value - @check_model_inputs @auto_docstring def forward( @@ -521,6 +515,12 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel): attentions=all_attentions, ) + def get_input_embeddings(self): + return self.embed_in + + def set_input_embeddings(self, value): + self.embed_in = value + @auto_docstring( custom_intro=""" diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 093dbf355d..43822682df 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -927,12 +927,6 @@ class GPTJForCausalLM(GPTJPreTrainedModel, GenerationMixin): self.model_parallel = False torch.cuda.empty_cache() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py index ce4bbc6d2f..d9327bb50a 100644 --- a/src/transformers/models/gptj/modeling_tf_gptj.py +++ b/src/transformers/models/gptj/modeling_tf_gptj.py @@ -751,12 +751,6 @@ class TFGPTJForCausalLM(TFGPTJPreTrainedModel, TFCausalLanguageModelingLoss): ) self.config = config - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs): token_type_ids = kwargs.get("token_type_ids", None) # only last token for inputs_ids if past is defined in kwargs diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index 13804b9c26..8bebef03c2 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -383,12 +383,6 @@ class GraniteModel(GranitePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -504,18 +498,6 @@ class GraniteForCausalLM(GranitePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index caa9214183..bf72cc85da 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -636,12 +636,6 @@ class GraniteMoeModel(GraniteMoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -894,18 +888,6 @@ class GraniteMoeForCausalLM(GraniteMoePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py index df823c30bd..8b3f3d1dcc 100644 --- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py @@ -1298,12 +1298,6 @@ class GraniteMoeHybridModel(GraniteMoeHybridPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1654,18 +1648,6 @@ class GraniteMoeHybridForCausalLM(GraniteMoeHybridPreTrainedModel, GenerationMix # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index 7bd81f2db3..83f78ae327 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -588,12 +588,6 @@ class GraniteMoeSharedModel(GraniteMoeSharedPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -926,18 +920,6 @@ class GraniteMoeSharedForCausalLM(GraniteMoeSharedPreTrainedModel, GenerationMix # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py index f68f810dda..f15fbd48dd 100644 --- a/src/transformers/models/helium/modeling_helium.py +++ b/src/transformers/models/helium/modeling_helium.py @@ -355,12 +355,6 @@ class HeliumModel(HeliumPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -437,18 +431,6 @@ class HeliumForCausalLM(HeliumPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -540,12 +522,6 @@ class HeliumForSequenceClassification(HeliumPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -632,12 +608,6 @@ class HeliumForTokenClassification(HeliumPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index ec752c2b1d..741886c142 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -993,12 +993,6 @@ class IdeficsModel(IdeficsPreTrainedModel): def freeze_vision_layers(self, module_exceptions=[]): freeze_model(self.vision_model, module_exceptions=module_exceptions) - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1344,18 +1338,6 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/idefics/modeling_tf_idefics.py b/src/transformers/models/idefics/modeling_tf_idefics.py index 769c297e51..f006862284 100644 --- a/src/transformers/models/idefics/modeling_tf_idefics.py +++ b/src/transformers/models/idefics/modeling_tf_idefics.py @@ -1223,12 +1223,6 @@ class TFIdeficsMainLayer(tf.keras.layers.Layer): def freeze_vision_layers(self, module_exceptions=[]): freeze_model(self.vision_model, module_exceptions=module_exceptions) - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] @@ -1613,18 +1607,6 @@ class TFIdeficsForVisionText2Text(TFPreTrainedModel, TFCausalLanguageModelingLos name="lm_head", ) - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 60022d781a..a2afefa827 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1141,12 +1141,6 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel, GenerationMixin) def set_input_embeddings(self, value): self.model.text_model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None): return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask) diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index bb0cd4f70f..9200bb7159 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -869,14 +869,6 @@ class Idefics3ForConditionalGeneration(Idefics3PreTrainedModel, GenerationMixin) def set_input_embeddings(self, value): self.model.text_model.set_input_embeddings(value) - # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None): return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask) diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index 911fc4d72d..cf9d4339fa 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -799,12 +799,6 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py index 983d16ef03..46ef56dc46 100644 --- a/src/transformers/models/internvl/modeling_internvl.py +++ b/src/transformers/models/internvl/modeling_internvl.py @@ -843,9 +843,6 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin) def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 52c5d7828a..d6a2aaabd8 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -1125,12 +1125,6 @@ class JambaModel(JambaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1298,18 +1292,6 @@ class JambaForCausalLM(JambaPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -1495,12 +1477,6 @@ class JambaForSequenceClassification(JambaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py index 3cd578bc80..2a2257dfec 100644 --- a/src/transformers/models/janus/modeling_janus.py +++ b/src/transformers/models/janus/modeling_janus.py @@ -1145,12 +1145,6 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin): hidden_state = self.model.generation_aligner(hidden_state) return hidden_state - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 313b431329..81e6390183 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1004,12 +1004,6 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin): hidden_state = self.model.generation_aligner(hidden_state) return hidden_state - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 224d35f97b..996eece489 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -875,14 +875,6 @@ class JetMoeModel(JetMoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - # Copied from transformers.models.llama.modeling_llama.LlamaModel.get_input_embeddings - def get_input_embeddings(self): - return self.embed_tokens - - # Copied from transformers.models.llama.modeling_llama.LlamaModel.set_input_embeddings - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1130,22 +1122,6 @@ class JetMoeForCausalLM(JetMoePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings - def get_input_embeddings(self): - return self.model.embed_tokens - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder def set_decoder(self, decoder): self.model = decoder @@ -1258,12 +1234,6 @@ class JetMoeForSequenceClassification(JetMoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index a6cf92bfd0..67d53f2a7a 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -1254,9 +1254,6 @@ class Kosmos2TextModel(Kosmos2PreTrainedModel): def get_input_embeddings(self) -> nn.Module: return self.model.embed_tokens - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1337,15 +1334,9 @@ class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin): def get_input_embeddings(self) -> nn.Module: return self.model.embed_tokens - def set_input_embeddings(self, value): - self.model.embed_tokens = value - def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index 5892d76490..d2e9d92e78 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -815,12 +815,6 @@ class KyutaiSpeechToTextModel(KyutaiSpeechToTextPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -1094,18 +1088,6 @@ class KyutaiSpeechToTextForConditionalGeneration(KyutaiSpeechToTextPreTrainedMod # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index ae0f361bd4..f0af302bf8 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -2054,12 +2054,6 @@ class LEDForConditionalGeneration(LEDPreTrainedModel, GenerationMixin): new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/lfm2/modeling_lfm2.py b/src/transformers/models/lfm2/modeling_lfm2.py index 99b2730507..0d383769d1 100644 --- a/src/transformers/models/lfm2/modeling_lfm2.py +++ b/src/transformers/models/lfm2/modeling_lfm2.py @@ -582,12 +582,6 @@ class Lfm2Model(Lfm2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -669,18 +663,6 @@ class Lfm2ForCausalLM(Lfm2PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index ee7c72aabb..4bab75a87c 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -354,12 +354,6 @@ class LlamaModel(LlamaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -436,18 +430,6 @@ class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -544,12 +526,6 @@ class LlamaForSequenceClassification(LlamaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -697,12 +673,6 @@ class LlamaForTokenClassification(LlamaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index 3be5760ae2..85aeb70ce3 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -489,12 +489,6 @@ class Llama4TextModel(Llama4PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -619,18 +613,6 @@ class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 58331bdff0..032751a4e1 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -351,9 +351,6 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin): def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 03fa7015d3..7cbad1b980 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -552,9 +552,6 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index c28a9c1565..7721d760ea 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -689,9 +689,6 @@ class LlavaNextVideoForConditionalGeneration(LlavaNextVideoPreTrainedModel, Gene def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 2d07527f48..ea5ca1e5ea 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -722,9 +722,6 @@ class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, Gene def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index 871e75a129..d4e29c619b 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -1367,10 +1367,6 @@ class LongT5Stack(LongT5PreTrainedModel): # Initialize weights and apply final processing self.post_init() - # Copied from transformers.models.t5.modeling_t5.T5Stack.get_input_embeddings - def get_input_embeddings(self): - return self.embed_tokens - # Copied from transformers.models.t5.modeling_t5.T5Stack.set_input_embeddings def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -1929,12 +1925,6 @@ class LongT5ForConditionalGeneration(LongT5PreTrainedModel, GenerationMixin): self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_output_embeddings(self): - return self.lm_head - def get_encoder(self): return self.encoder diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index b45fb0e68b..6790872107 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -1330,12 +1330,6 @@ class M2M100ForConditionalGeneration(M2M100PreTrainedModel, GenerationMixin): def get_decoder(self): return self.model.get_decoder() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 06d87b4d5c..83de5dd6ae 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -714,12 +714,6 @@ class MambaForCausalLM(MambaPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_input_embeddings(self): return self.backbone.get_input_embeddings() diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py index 5a83186fb0..7809707d4c 100644 --- a/src/transformers/models/mamba2/modeling_mamba2.py +++ b/src/transformers/models/mamba2/modeling_mamba2.py @@ -947,12 +947,6 @@ class Mamba2ForCausalLM(Mamba2PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_input_embeddings(self): return self.backbone.get_input_embeddings() diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 3bcc64db47..5f988b3a82 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -724,12 +724,6 @@ class MarianEncoder(MarianPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -881,12 +875,6 @@ class MarianDecoder(MarianPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1424,9 +1412,6 @@ class MarianMTModel(MarianPreTrainedModel, GenerationMixin): new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - def set_output_embeddings(self, new_embeddings: nn.Embedding): self.lm_head = new_embeddings @@ -1617,12 +1602,6 @@ class MarianForCausalLM(MarianPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index c02de1d4fe..0a6880415f 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -920,12 +920,6 @@ class MBartDecoder(MBartPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1341,12 +1335,6 @@ class MBartForConditionalGeneration(MBartPreTrainedModel, GenerationMixin): new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, @@ -1814,12 +1802,6 @@ class MBartForCausalLM(MBartPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/minimax/modeling_minimax.py b/src/transformers/models/minimax/modeling_minimax.py index 40ef35a3a6..6923fdc91a 100644 --- a/src/transformers/models/minimax/modeling_minimax.py +++ b/src/transformers/models/minimax/modeling_minimax.py @@ -663,12 +663,6 @@ class MiniMaxModel(MiniMaxPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -845,18 +839,6 @@ class MiniMaxForCausalLM(MiniMaxPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -974,12 +956,6 @@ class MiniMaxForSequenceClassification(MiniMaxPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1066,12 +1042,6 @@ class MiniMaxForTokenClassification(MiniMaxPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 29727dc415..1189f1b34d 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -333,12 +333,6 @@ class MistralModel(MistralPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -416,18 +410,6 @@ class MistralForCausalLM(MistralPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -518,12 +500,6 @@ class MistralForTokenClassification(MistralPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -593,12 +569,6 @@ class MistralForSequenceClassification(MistralPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/mistral/modeling_tf_mistral.py b/src/transformers/models/mistral/modeling_tf_mistral.py index d141d22265..b9afd8ab26 100644 --- a/src/transformers/models/mistral/modeling_tf_mistral.py +++ b/src/transformers/models/mistral/modeling_tf_mistral.py @@ -499,12 +499,6 @@ class TFMistralMainLayer(keras.layers.Layer): self.norm = TFMistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, name="norm") self.config = config - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] @@ -815,18 +809,6 @@ class TFMistralForCausalLM(TFMistralPreTrainedModel, TFCausalLanguageModelingLos ) self.config = config - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -952,12 +934,6 @@ class TFMistralForSequenceClassification(TFMistralPreTrainedModel, TFSequenceCla ) self.config = config - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @unpack_inputs @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def call( diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py index 769697ada0..d1a9c83f9d 100644 --- a/src/transformers/models/mistral3/modeling_mistral3.py +++ b/src/transformers/models/mistral3/modeling_mistral3.py @@ -390,9 +390,6 @@ class Mistral3ForConditionalGeneration(Mistral3PreTrainedModel, GenerationMixin) def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 671bc0390a..ec18500269 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -428,12 +428,6 @@ class MixtralModel(MixtralPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -599,18 +593,6 @@ class MixtralForCausalLM(MixtralPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -728,12 +710,6 @@ class MixtralForSequenceClassification(MixtralPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -820,12 +796,6 @@ class MixtralForTokenClassification(MixtralPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index b520b75001..7c126f42f1 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1273,12 +1273,6 @@ class MllamaTextModel(MllamaPreTrainedModel): self.gradient_checkpointing = False self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -1468,18 +1462,6 @@ class MllamaForCausalLM(MllamaPreTrainedModel, GenerationMixin): self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -1775,12 +1757,6 @@ class MllamaForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index 61ab1cbcc5..9b229e4074 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -614,12 +614,6 @@ class MoonshineDecoder(MoonshinePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs def forward( self, diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 94a3a1fa1f..5a9aa1e993 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -1226,12 +1226,6 @@ class MoshiModel(MoshiPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -1503,18 +1497,6 @@ class MoshiForCausalLM(MoshiPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/mpt/modeling_mpt.py b/src/transformers/models/mpt/modeling_mpt.py index 706318cc79..849b3c4851 100644 --- a/src/transformers/models/mpt/modeling_mpt.py +++ b/src/transformers/models/mpt/modeling_mpt.py @@ -438,9 +438,6 @@ class MptForCausalLM(MptPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - def set_output_embeddings(self, new_embeddings: torch.Tensor): self.lm_head = new_embeddings diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index 6b9be40ebf..a9d0fd9781 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -927,9 +927,6 @@ class MT5Stack(MT5PreTrainedModel): self.final_layer_norm = self.final_layer_norm.to("cpu") torch.cuda.empty_cache() - def get_input_embeddings(self): - return self.embed_tokens - def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -1633,24 +1630,14 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel, GenerationMixin): self.device_map = None torch.cuda.empty_cache() - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_input_embeddings def get_input_embeddings(self): return self.shared - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_input_embeddings def set_input_embeddings(self, new_embeddings): self.shared = new_embeddings self.encoder.set_input_embeddings(new_embeddings) self.decoder.set_input_embeddings(new_embeddings) - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_encoder def get_encoder(self): return self.encoder diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index c2e8e430c7..64edaee56f 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -476,12 +476,6 @@ class MusicgenDecoder(MusicgenPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index fce8bea6b4..a454d9fe24 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -442,12 +442,6 @@ class MusicgenMelodyDecoder(MusicgenMelodyPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring # Ignore copy def forward( diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index 370afff391..52da33565d 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -560,12 +560,6 @@ class MvpEncoder(MvpPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -747,12 +741,6 @@ class MvpDecoder(MvpPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1165,12 +1153,6 @@ class MvpForConditionalGeneration(MvpPreTrainedModel, GenerationMixin): new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_lightweight_tuning(self): self.model.set_lightweight_tuning() self.lm_head.requires_grad_(False) @@ -1695,12 +1677,6 @@ class MvpForCausalLM(MvpPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index 8da4a3bde8..df64eec95c 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -631,12 +631,6 @@ class NemotronModel(NemotronPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -869,18 +863,6 @@ class NemotronForCausalLM(NemotronPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -987,12 +969,6 @@ class NemotronForSequenceClassification(NemotronPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1142,12 +1118,6 @@ class NemotronForTokenClassification(NemotronPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index c8c927ed58..909993b5d3 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -1605,12 +1605,6 @@ class NllbMoeForConditionalGeneration(NllbMoePreTrainedModel, GenerationMixin): def get_decoder(self): return self.model.get_decoder() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index ef66f62236..817341cc25 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -331,12 +331,6 @@ class OlmoModel(OlmoPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -413,18 +407,6 @@ class OlmoForCausalLM(OlmoPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/olmo2/modeling_olmo2.py b/src/transformers/models/olmo2/modeling_olmo2.py index fde7f95adf..b589364df5 100644 --- a/src/transformers/models/olmo2/modeling_olmo2.py +++ b/src/transformers/models/olmo2/modeling_olmo2.py @@ -338,12 +338,6 @@ class Olmo2Model(Olmo2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -420,18 +414,6 @@ class Olmo2ForCausalLM(Olmo2PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index 89ebcb2013..8caff4d9ab 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -740,12 +740,6 @@ class OlmoeModel(OlmoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -992,18 +986,6 @@ class OlmoeForCausalLM(OlmoePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 074af9ce11..27c84910cb 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -542,12 +542,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, @@ -633,12 +627,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index e275168017..2fe9a14677 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -375,12 +375,6 @@ class OPTDecoder(OPTPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask def _update_causal_mask( self, @@ -775,12 +769,6 @@ class OPTForCausalLM(OPTPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/opt/modeling_tf_opt.py b/src/transformers/models/opt/modeling_tf_opt.py index 38fb203d4c..b7c817a034 100644 --- a/src/transformers/models/opt/modeling_tf_opt.py +++ b/src/transformers/models/opt/modeling_tf_opt.py @@ -544,9 +544,6 @@ class TFOPTDecoder(keras.layers.Layer): self.embed_tokens.vocab_size = new_embeddings.shape[0] self.embed_tokens.weight = new_embeddings - def get_input_embeddings(self): - return self.embed_tokens - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, past_key_values_length): # create causal mask # # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index f10ece1c19..581269653c 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -404,12 +404,6 @@ class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixi def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index f3c5bc8a4f..05e34da4d2 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -896,12 +896,6 @@ class PegasusDecoder(PegasusPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def resize_position_embeddings(self, new_num_position_embeddings: int): """ Resizes position embeddings matrix of the model if `new_num_position_embeddings != @@ -1372,12 +1366,6 @@ class PegasusForConditionalGeneration(PegasusPreTrainedModel, GenerationMixin): new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def resize_position_embeddings(self, new_num_position_embeddings: int): """ Resizes position embeddings matrix of the model if `new_num_position_embeddings != @@ -1561,12 +1549,6 @@ class PegasusForCausalLM(PegasusPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index 46d3730695..ee48a02b04 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -1200,12 +1200,6 @@ class PegasusXDecoder(PegasusXPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids=None, @@ -1596,12 +1590,6 @@ class PegasusXForConditionalGeneration(PegasusXPreTrainedModel, GenerationMixin) def get_decoder(self): return self.model.get_decoder() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def resize_position_embeddings(self, new_num_position_embeddings: int): """ Resizes position embeddings matrix of the model if `new_num_position_embeddings != diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py index d646569ff0..657aa80569 100644 --- a/src/transformers/models/perception_lm/modeling_perception_lm.py +++ b/src/transformers/models/perception_lm/modeling_perception_lm.py @@ -326,9 +326,6 @@ class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, Generati def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index cb3313753e..64f3bdd7b5 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -435,12 +435,6 @@ class PersimmonModel(PersimmonPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -676,22 +670,6 @@ class PersimmonForCausalLM(PersimmonPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings - def get_input_embeddings(self): - return self.model.embed_tokens - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder def set_decoder(self, decoder): self.model = decoder @@ -807,12 +785,6 @@ class PersimmonForSequenceClassification(PersimmonPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -900,12 +872,6 @@ class PersimmonForTokenClassification(PersimmonPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 2ca954635b..ea77b2d471 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -340,12 +340,6 @@ class PhiModel(PhiPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -460,18 +454,6 @@ class PhiForCausalLM(PhiPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -568,12 +550,6 @@ class PhiForSequenceClassification(PhiPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -660,12 +636,6 @@ class PhiForTokenClassification(PhiPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 35efc8d3db..c896def491 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -365,12 +365,6 @@ class Phi3Model(Phi3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -448,18 +442,6 @@ class Phi3ForCausalLM(Phi3PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -595,12 +577,6 @@ class Phi3ForSequenceClassification(Phi3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -687,12 +663,6 @@ class Phi3ForTokenClassification(Phi3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py index 301bd5a846..855e8b7fc1 100644 --- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py @@ -1641,12 +1641,6 @@ class Phi4MultimodalModel(Phi4MultimodalPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs def forward( self, @@ -1757,18 +1751,6 @@ class Phi4MultimodalForCausalLM(Phi4MultimodalPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index 4f8a9f2d28..0662acf7e6 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -932,12 +932,6 @@ class PhimoeModel(PhimoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1213,22 +1207,6 @@ class PhimoeForCausalLM(PhimoePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings - def get_input_embeddings(self): - return self.model.embed_tokens - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder def set_decoder(self, decoder): self.model = decoder @@ -1399,12 +1377,6 @@ class PhimoeForSequenceClassification(PhimoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 9eee774be2..5d080e8f0c 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -1037,18 +1037,9 @@ class Pix2StructTextModel(Pix2StructPreTrainedModel): self.post_init() self.gradient_checkpointing = False - def get_input_embeddings(self): - return self.embed_tokens - def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index 07cb8835bb..4236476349 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -575,12 +575,6 @@ class PLBartEncoder(PLBartPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -857,12 +851,6 @@ class PLBartDecoder(PLBartPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -1298,12 +1286,6 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin): new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, @@ -1645,12 +1627,6 @@ class PLBartForCausalLM(PLBartPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.decoder.embed_tokens = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.decoder = decoder diff --git a/src/transformers/models/plbart/modular_plbart.py b/src/transformers/models/plbart/modular_plbart.py index 3547b1da40..3de32a625a 100644 --- a/src/transformers/models/plbart/modular_plbart.py +++ b/src/transformers/models/plbart/modular_plbart.py @@ -464,12 +464,6 @@ class PLBartForConditionalGeneration(PLBartPreTrainedModel, GenerationMixin): new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) self.register_buffer("final_logits_bias", new_bias) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index aab17019ed..795dfb5874 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -678,10 +678,6 @@ class Pop2PianoStack(Pop2PianoPreTrainedModel): self.device_map = None self.gradient_checkpointing = False - # Copied from transformers.models.t5.modeling_t5.T5Stack.get_input_embeddings - def get_input_embeddings(self): - return self.embed_tokens - # Copied from transformers.models.t5.modeling_t5.T5Stack.set_input_embeddings def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -1045,12 +1041,6 @@ class Pop2PianoForConditionalGeneration(Pop2PianoPreTrainedModel, GenerationMixi self.encoder.set_input_embeddings(new_embeddings) self.decoder.set_input_embeddings(new_embeddings) - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_output_embeddings(self): - return self.lm_head - def get_encoder(self): return self.encoder diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 467194eafd..a7558b16ed 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -1637,12 +1637,6 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel, GenerationMi # Initialize weights and apply final processing self.post_init() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def _tie_weights(self): if self.config.tie_word_embeddings: self._tie_or_clone_weights(self.prophetnet.word_embeddings, self.lm_head) @@ -1846,12 +1840,6 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.prophetnet.decoder.word_embeddings = value - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def _tie_weights(self): if self.config.tie_word_embeddings: self._tie_or_clone_weights(self.prophetnet.decoder.word_embeddings, self.lm_head) diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index eddfef4ced..e8c5a1bc8a 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -337,12 +337,6 @@ class Qwen2Model(Qwen2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -432,18 +426,6 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -540,12 +522,6 @@ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -632,12 +608,6 @@ class Qwen2ForTokenClassification(Qwen2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py index 8174f22af2..37c0c6da43 100644 --- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py @@ -1553,12 +1553,6 @@ class Qwen2_5OmniThinkerTextModel(Qwen2_5OmniPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -1702,6 +1696,7 @@ class Qwen2_5OmniThinkerTextModel(Qwen2_5OmniPreTrainedModel): class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration, GenerationMixin): config: Qwen2_5OmniThinkerConfig base_model_prefix = "thinker" + _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"] _no_split_modules = ["Qwen2_5OmniAudioEncoder", "Qwen2_5OmniVisionEncoder"] def __init__(self, config: Qwen2_5OmniThinkerConfig): @@ -2110,12 +2105,6 @@ class Qwen2_5OmniTalkerModel(Qwen2_5OmniPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 5b501cf7f9..d40b0a073c 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -2142,6 +2142,7 @@ class Qwen2_5OmniThinkerTextModel(Qwen2_5_VLTextModel): class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForConditionalGeneration, GenerationMixin): config: Qwen2_5OmniThinkerConfig base_model_prefix = "thinker" + _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"] _no_split_modules = ["Qwen2_5OmniAudioEncoder", "Qwen2_5OmniVisionEncoder"] def __init__(self, config: Qwen2_5OmniThinkerConfig): diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 723fde6d84..66fb7a7c06 100644 --- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -824,12 +824,6 @@ class Qwen2_5_VLTextModel(Qwen2_5_VLPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -1402,12 +1396,6 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index dc70ad547f..ff6eb1908d 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -780,12 +780,6 @@ class Qwen2MoeModel(Qwen2MoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1065,18 +1059,6 @@ class Qwen2MoeForCausalLM(Qwen2MoePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -1202,12 +1184,6 @@ class Qwen2MoeForSequenceClassification(Qwen2MoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1295,12 +1271,6 @@ class Qwen2MoeForTokenClassification(Qwen2MoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1347,7 +1317,6 @@ class Qwen2MoeForTokenClassification(Qwen2MoePreTrainedModel): @auto_docstring -# Copied from transformers.models.mistral.modeling_mistral.MistralForQuestionAnswering with Mistral->Qwen2Moe, MISTRAL->QWEN2MOE, BaseModelOutputWithPast->MoeModelOutputWithPast class Qwen2MoeForQuestionAnswering(Qwen2MoePreTrainedModel): base_model_prefix = "model" @@ -1359,12 +1328,6 @@ class Qwen2MoeForQuestionAnswering(Qwen2MoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index cdda0d6938..b14264f455 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -799,12 +799,6 @@ class Qwen2VLTextModel(Qwen2VLPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -1295,12 +1289,6 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/qwen3/modeling_qwen3.py b/src/transformers/models/qwen3/modeling_qwen3.py index 4af78a109f..73f0631480 100644 --- a/src/transformers/models/qwen3/modeling_qwen3.py +++ b/src/transformers/models/qwen3/modeling_qwen3.py @@ -363,12 +363,6 @@ class Qwen3Model(Qwen3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -458,18 +452,6 @@ class Qwen3ForCausalLM(Qwen3PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -566,12 +548,6 @@ class Qwen3ForSequenceClassification(Qwen3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -658,12 +634,6 @@ class Qwen3ForTokenClassification(Qwen3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py index 581dd2faba..f37568777b 100644 --- a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -451,12 +451,6 @@ class Qwen3MoeModel(Qwen3MoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -622,18 +616,6 @@ class Qwen3MoeForCausalLM(Qwen3MoePreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -751,12 +733,6 @@ class Qwen3MoeForSequenceClassification(Qwen3MoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -843,12 +819,6 @@ class Qwen3MoeForTokenClassification(Qwen3MoePreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index 833b1689c2..2385f8e623 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -587,14 +587,6 @@ class RecurrentGemmaModel(RecurrentGemmaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - # Copied from transformers.models.llama.modeling_llama.LlamaModel.get_input_embeddings - def get_input_embeddings(self): - return self.embed_tokens - - # Copied from transformers.models.llama.modeling_llama.LlamaModel.set_input_embeddings - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -705,18 +697,6 @@ class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index 6c1b827d8f..1c47aec148 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -1753,12 +1753,6 @@ class SeamlessM4TDecoder(SeamlessM4TPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -2025,12 +2019,6 @@ class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel, def get_decoder(self): return self.model.decoder - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_input_embeddings(self): return self.model.decoder.embed_tokens @@ -2498,12 +2486,6 @@ class SeamlessM4TForTextToText(SeamlessM4TPreTrainedModel, GenerationMixin): def get_decoder(self): return self.text_decoder - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_input_embeddings(self): return self.text_decoder.embed_tokens @@ -2759,12 +2741,6 @@ class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel, GenerationMixin): def get_decoder(self): return self.text_decoder - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_input_embeddings(self): return self.text_decoder.embed_tokens @@ -3031,12 +3007,6 @@ class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin): def get_decoder(self): return self.text_decoder - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_input_embeddings(self): return self.text_decoder.embed_tokens @@ -3358,12 +3328,6 @@ class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin): def get_decoder(self): return self.text_decoder - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_input_embeddings(self): return self.text_decoder.embed_tokens @@ -3714,12 +3678,6 @@ class SeamlessM4TModel(SeamlessM4TPreTrainedModel, GenerationMixin): else: return self.speech_encoder - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_input_embeddings(self): return self.text_decoder.embed_tokens diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index 38b736d23e..6160d311c8 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -1796,12 +1796,6 @@ class SeamlessM4Tv2Decoder(SeamlessM4Tv2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -2001,12 +1995,6 @@ class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, char_input_ids: Optional[torch.LongTensor] = None, @@ -2236,14 +2224,6 @@ class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedMod def get_decoder(self): return self.model.decoder - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_input_embeddings def get_input_embeddings(self): return self.model.decoder.embed_tokens @@ -2714,12 +2694,6 @@ class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin): def get_decoder(self): return self.text_decoder - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_input_embeddings(self): return self.text_decoder.embed_tokens @@ -2978,14 +2952,6 @@ class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin def get_decoder(self): return self.text_decoder - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_input_embeddings def get_input_embeddings(self): return self.text_decoder.embed_tokens @@ -3260,14 +3226,6 @@ class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin def get_decoder(self): return self.text_decoder - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_input_embeddings def get_input_embeddings(self): return self.text_decoder.embed_tokens @@ -3627,14 +3585,6 @@ class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMix def get_decoder(self): return self.text_decoder - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_input_embeddings def get_input_embeddings(self): return self.text_decoder.embed_tokens @@ -4022,14 +3972,6 @@ class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin): else: return self.speech_encoder - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_input_embeddings def get_input_embeddings(self): return self.text_decoder.embed_tokens diff --git a/src/transformers/models/smollm3/modeling_smollm3.py b/src/transformers/models/smollm3/modeling_smollm3.py index c81a204263..92a3205a7b 100644 --- a/src/transformers/models/smollm3/modeling_smollm3.py +++ b/src/transformers/models/smollm3/modeling_smollm3.py @@ -367,12 +367,6 @@ class SmolLM3Model(SmolLM3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs @auto_docstring def forward( @@ -462,18 +456,6 @@ class SmolLM3ForCausalLM(SmolLM3PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -570,12 +552,6 @@ class SmolLM3ForSequenceClassification(SmolLM3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -662,12 +638,6 @@ class SmolLM3ForTokenClassification(SmolLM3PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py index dc1a6290d7..d83fe01c14 100644 --- a/src/transformers/models/smolvlm/modeling_smolvlm.py +++ b/src/transformers/models/smolvlm/modeling_smolvlm.py @@ -843,12 +843,6 @@ class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin): def set_input_embeddings(self, value): self.model.text_model.set_input_embeddings(value) - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None): return self.model.get_image_features(pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask) diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 63e4cca6f3..d85b75b8f2 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -761,12 +761,6 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids=None, @@ -1220,12 +1214,6 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel, Generation def get_decoder(self): return self.model.get_decoder() - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index e0fc6dd47d..81fbf13daf 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -1428,12 +1428,6 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus new_embeddings = super().resize_token_embeddings(new_num_tokens) return new_embeddings - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @unpack_inputs @add_start_docstrings_to_model_forward(SPEECH_TO_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 4d6e5da03e..15d044d0f0 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -37,7 +37,7 @@ from ...modeling_outputs import ( Seq2SeqModelOutput, Seq2SeqSpectrogramOutput, ) -from ...modeling_utils import PreTrainedModel +from ...modeling_utils import EmbeddingAccessMixin, PreTrainedModel from ...utils import auto_docstring, logging from .configuration_speecht5 import SpeechT5Config, SpeechT5HifiGanConfig @@ -762,7 +762,7 @@ class SpeechT5SpeechDecoderPostnet(nn.Module): return hidden_states + layer_output.transpose(1, 2) -class SpeechT5TextEncoderPrenet(nn.Module): +class SpeechT5TextEncoderPrenet(nn.Module, EmbeddingAccessMixin): def __init__(self, config): super().__init__() self.config = config @@ -773,19 +773,13 @@ class SpeechT5TextEncoderPrenet(nn.Module): config.max_text_positions, ) - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward(self, input_ids: torch.Tensor): inputs_embeds = self.embed_tokens(input_ids) inputs_embeds = self.encode_positions(inputs_embeds) return inputs_embeds -class SpeechT5TextDecoderPrenet(nn.Module): +class SpeechT5TextDecoderPrenet(nn.Module, EmbeddingAccessMixin): def __init__(self, config): super().__init__() self.config = config @@ -800,12 +794,6 @@ class SpeechT5TextDecoderPrenet(nn.Module): config.pad_token_id, ) - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: torch.Tensor, @@ -835,7 +823,7 @@ class SpeechT5TextDecoderPrenet(nn.Module): return inputs_embeds, attention_mask -class SpeechT5TextDecoderPostnet(nn.Module): +class SpeechT5TextDecoderPostnet(nn.Module, EmbeddingAccessMixin): def __init__(self, config): super().__init__() self.config = config @@ -845,6 +833,8 @@ class SpeechT5TextDecoderPostnet(nn.Module): return self.lm_head(hidden_states) def get_output_embeddings(self): + # Post-net has no token embeddings, but its lm_head must still be + # tied to the decoder weights when `tie_word_embeddings=True`. return self.lm_head def set_output_embeddings(self, new_embeddings): diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 89fe06748c..b7a61f3360 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -664,12 +664,6 @@ class StableLmModel(StableLmPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -904,22 +898,6 @@ class StableLmForCausalLM(StableLmPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings - def get_input_embeddings(self): - return self.model.embed_tokens - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder def set_decoder(self, decoder): self.model = decoder @@ -1035,12 +1013,6 @@ class StableLmForSequenceClassification(StableLmPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -1128,12 +1100,6 @@ class StableLmForTokenClassification(StableLmPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 19208dbda1..9e574d5349 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -338,12 +338,6 @@ class Starcoder2Model(Starcoder2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs def forward( self, @@ -426,18 +420,6 @@ class Starcoder2ForCausalLM(Starcoder2PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -534,12 +516,6 @@ class Starcoder2ForSequenceClassification(Starcoder2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( @@ -626,12 +602,6 @@ class Starcoder2ForTokenClassification(Starcoder2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @can_return_tuple @auto_docstring def forward( diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index 2c5a5694e4..d71dffb987 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -887,9 +887,6 @@ class SwitchTransformersStack(SwitchTransformersPreTrainedModel): self.device_map = None self.gradient_checkpointing = False - def get_input_embeddings(self): - return self.embed_tokens - def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -1478,12 +1475,6 @@ class SwitchTransformersForConditionalGeneration(SwitchTransformersPreTrainedMod self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_output_embeddings(self): - return self.lm_head - def get_encoder(self): return self.encoder diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index ba5741a904..b5ff699f69 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -940,9 +940,6 @@ class T5Stack(T5PreTrainedModel): self.final_layer_norm = self.final_layer_norm.to("cpu") torch.cuda.empty_cache() - def get_input_embeddings(self): - return self.embed_tokens - def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -1619,12 +1616,6 @@ class T5ForConditionalGeneration(T5PreTrainedModel, GenerationMixin): self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_output_embeddings(self): - return self.lm_head - def get_encoder(self): return self.encoder diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py index cf40994dde..c2fdbf5fc7 100644 --- a/src/transformers/models/t5gemma/modeling_t5gemma.py +++ b/src/transformers/models/t5gemma/modeling_t5gemma.py @@ -707,12 +707,6 @@ class T5GemmaEncoder(T5GemmaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs def forward( self, diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py index 4f416ef6b2..5c72d76b4e 100644 --- a/src/transformers/models/t5gemma/modular_t5gemma.py +++ b/src/transformers/models/t5gemma/modular_t5gemma.py @@ -574,12 +574,6 @@ class T5GemmaEncoder(T5GemmaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @check_model_inputs def forward( self, diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index 1bc780b3d0..fd1cc21fff 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -472,12 +472,6 @@ class TrOCRDecoder(TrOCRPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids=None, diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 27ed407d4a..70a174474b 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -1132,9 +1132,6 @@ class UdopStack(UdopPreTrainedModel): relative_bias_list = create_relative_bias(config) return RelativePositionBiasAggregated(relative_bias_list) - def get_input_embeddings(self): - return self.embed_tokens - def get_output_embeddings(self): return self.embed_tokens @@ -1713,12 +1710,6 @@ class UdopForConditionalGeneration(UdopPreTrainedModel, GenerationMixin): self.encoder.set_input_embeddings(new_embeddings) self.decoder.set_input_embeddings(new_embeddings) - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_output_embeddings(self): - return self.lm_head - def get_encoder(self): return self.encoder diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py index b39c63d43a..47b11acfd8 100644 --- a/src/transformers/models/umt5/modeling_umt5.py +++ b/src/transformers/models/umt5/modeling_umt5.py @@ -630,9 +630,6 @@ class UMT5Stack(UMT5PreTrainedModel): self.gradient_checkpointing = False self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - def set_input_embeddings(self, new_embeddings): self.embed_tokens = new_embeddings @@ -1206,14 +1203,6 @@ class UMT5ForConditionalGeneration(UMT5PreTrainedModel, GenerationMixin): self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_output_embeddings - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_output_embeddings - def get_output_embeddings(self): - return self.lm_head - # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_encoder def get_encoder(self): return self.encoder diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 215e190502..aea7256868 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -430,9 +430,6 @@ class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel, GenerationMi def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 14fd0d6376..d3c263807b 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -313,9 +313,6 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin) def get_output_embeddings(self) -> nn.Module: return self.lm_head - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model.set_decoder(decoder) diff --git a/src/transformers/models/vits/modeling_vits.py b/src/transformers/models/vits/modeling_vits.py index 6accac596a..40ce1e1083 100644 --- a/src/transformers/models/vits/modeling_vits.py +++ b/src/transformers/models/vits/modeling_vits.py @@ -1167,12 +1167,6 @@ class VitsTextEncoder(nn.Module): self.encoder = VitsEncoder(config) self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1) - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids: torch.Tensor, diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index 02f7dc5a9b..a3c71745c5 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -867,12 +867,6 @@ class TFWhisperDecoder(keras.layers.Layer): self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, past_key_values_length): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 1ddd286f3f..7b74c4c0b8 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -762,12 +762,6 @@ class WhisperDecoder(WhisperPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - def forward( self, input_ids=None, diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index 16b661dd12..5915e2e9ae 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -885,12 +885,6 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss): ) self.config = config - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs): # only last token for inputs_ids if past is defined in kwargs if past_key_values: diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index d5c8b612ef..d5f9f09187 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -425,12 +425,6 @@ class XGLMModel(XGLMPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -617,18 +611,6 @@ class XGLMForCausalLM(XGLMPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - @auto_docstring def forward( self, diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py index 205f4d1eac..b1a8f8c516 100644 --- a/src/transformers/models/zamba/modeling_zamba.py +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -880,12 +880,6 @@ class ZambaModel(ZambaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -1045,18 +1039,6 @@ class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -1230,12 +1212,6 @@ class ZambaForSequenceClassification(ZambaPreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @auto_docstring def forward( self, diff --git a/src/transformers/models/zamba2/modeling_zamba2.py b/src/transformers/models/zamba2/modeling_zamba2.py index c932e257df..fe1482bcdf 100644 --- a/src/transformers/models/zamba2/modeling_zamba2.py +++ b/src/transformers/models/zamba2/modeling_zamba2.py @@ -1254,12 +1254,6 @@ class Zamba2Model(Zamba2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - @auto_docstring def forward( self, @@ -1477,18 +1471,6 @@ class Zamba2ForCausalLM(Zamba2PreTrainedModel, GenerationMixin): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - def set_decoder(self, decoder): self.model = decoder @@ -1662,12 +1644,6 @@ class Zamba2ForSequenceClassification(Zamba2PreTrainedModel): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - @auto_docstring def forward( self, diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index c3a8025e0f..642759b00d 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -401,6 +401,10 @@ class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase, Generatio config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + @unittest.skip(reason="skipped because of dropout") + def test_batching_equivalence(self): + pass + def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/models/speecht5/test_processor_speecht5.py b/tests/models/speecht5/test_processor_speecht5.py index f1e1529b23..75340fa4e0 100644 --- a/tests/models/speecht5/test_processor_speecht5.py +++ b/tests/models/speecht5/test_processor_speecht5.py @@ -21,7 +21,7 @@ import unittest from transformers import is_speech_available, is_torch_available from transformers.models.speecht5 import SpeechT5Tokenizer -from transformers.testing_utils import get_tests_dir, require_torch +from transformers.testing_utils import get_tests_dir, require_speech, require_torch from transformers.utils import FEATURE_EXTRACTOR_NAME @@ -35,6 +35,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe_char.model") @require_torch +@require_speech class SpeechT5ProcessorTest(unittest.TestCase): @classmethod def setUpClass(cls):