diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py index 49d27f7789..ff359fa416 100644 --- a/examples/modular-transformers/configuration_my_new_model.py +++ b/examples/modular-transformers/configuration_my_new_model.py @@ -125,8 +125,6 @@ class MyNewModelConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ``` - new_param (`int`, *optional*, defaults to `False`): - A fun new parameter """ model_type = "my_new_model" diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index 15865a2c16..c116b55d4d 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -437,32 +437,6 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin): num_logits_to_keep: int = 0, ) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, NewTaskModelForNewTask - - >>> model = NewTaskModelForNewTask.from_pretrained("google/new_task_model2-3b-mix-224") - >>> processor = AutoProcessor.from_pretrained("google/new_task_model2-3b-mix-224") - - >>> prompt = "Where is the cat standing?" - >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(**inputs,) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Where is the cat standing?\nsnow" - ``` Returns: """ vlm_outputs = super().forward( diff --git a/examples/modular-transformers/modular_my_new_model.py b/examples/modular-transformers/modular_my_new_model.py index c1ea8b0a72..58b74cd7eb 100644 --- a/examples/modular-transformers/modular_my_new_model.py +++ b/examples/modular-transformers/modular_my_new_model.py @@ -2,11 +2,122 @@ from transformers.models.llama.configuration_llama import LlamaConfig # Example where we only want to only add a new config argument and new arg doc -# here there is no `ARG` so we are gonna take parent doc class MyNewModelConfig(LlamaConfig): r""" - new_param (`int`, *optional*, defaults to `False`): - A fun new parameter + This is the configuration class to store the configuration of a [`MyNewModelModel`]. It is used to instantiate an MyNewModel + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the MyNewModel-7B. + e.g. [meta-my_new_model/MyNewModel-2-7b-hf](https://huggingface.co/meta-my_new_model/MyNewModel-2-7b-hf) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the MyNewModel model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MyNewModelModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. MyNewModel 1 supports up to 2048 tokens, + MyNewModel 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'my_new_model3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'my_new_model3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'my_new_model3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'my_new_model3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + head_dim (`int`, *optional*): + The attention head dimension. If None, it will default to hidden_size // num_attention_heads + + ```python + >>> from transformers import MyNewModelModel, MyNewModelConfig + + >>> # Initializing a MyNewModel my_new_model-7b style configuration + >>> configuration = MyNewModelConfig() + + >>> # Initializing a model from the my_new_model-7b style configuration + >>> model = MyNewModelModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` """ def __init__(self, mlp_bias=True, new_param=0, **super_kwargs): diff --git a/src/transformers/models/d_fine/modeling_d_fine.py b/src/transformers/models/d_fine/modeling_d_fine.py index 068519dbf4..acadb52872 100644 --- a/src/transformers/models/d_fine/modeling_d_fine.py +++ b/src/transformers/models/d_fine/modeling_d_fine.py @@ -1674,21 +1674,7 @@ class DFineForObjectDetection(DFinePreTrainedModel): return_dict: Optional[bool] = None, **kwargs, ) -> Union[tuple[torch.FloatTensor], DFineObjectDetectionOutput]: - r""" - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you - can choose to directly pass a flattened representation of an image. - decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): - Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an - embedded representation. - labels (`list[Dict]` of len `(batch_size,)`, *optional*): - Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the - following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch - respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes - in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. - - Examples: - + """ ```python >>> import torch >>> from transformers.image_utils import load_image @@ -1729,7 +1715,8 @@ class DFineForObjectDetection(DFinePreTrainedModel): Detected cat with confidence 0.956 at location [11.71, 53.52, 316.64, 472.33] Detected remote with confidence 0.947 at location [40.46, 73.7, 175.62, 117.57] Detected sofa with confidence 0.918 at location [0.59, 1.88, 640.25, 474.74] - ```""" + ``` + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/diffllama/modeling_diffllama.py b/src/transformers/models/diffllama/modeling_diffllama.py index 51e3c4512e..4c36cc0f85 100644 --- a/src/transformers/models/diffllama/modeling_diffllama.py +++ b/src/transformers/models/diffllama/modeling_diffllama.py @@ -729,11 +729,6 @@ class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin): **kwargs: Unpack[TransformersKwargs], ) -> CausalLMOutputWithPast: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - Example: ```python diff --git a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py index adbb34c2fd..eaa2fd313f 100644 --- a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +++ b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py @@ -746,7 +746,7 @@ class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMi output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> BackboneOutput: - r""" + """ Examples: ```python diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index 4d050a0bbb..24a26f3777 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -1292,11 +1292,6 @@ class Emu3ForCausalLM(Emu3PreTrainedModel, GenerationMixin): **kwargs: Unpack[TransformersKwargs], ) -> CausalLMOutputWithPast: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - Example: ```python diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py index 181167a9aa..d156dcc94e 100644 --- a/src/transformers/models/falcon_h1/modeling_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modeling_falcon_h1.py @@ -1530,11 +1530,6 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin): **kwargs, ) -> Union[tuple, CausalLMOutputWithPast]: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - Example: ```python diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 13a65ae661..85d20faf52 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -479,11 +479,6 @@ class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin): **kwargs: Unpack[TransformersKwargs], ) -> CausalLMOutputWithPast: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - Example: ```python diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 506fda8f7f..25bdfb7ab8 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -553,11 +553,6 @@ class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin): **kwargs, ) -> CausalLMOutputWithPast: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - Example: ```python diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index 4560ff8cf3..38b108bc87 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -658,11 +658,6 @@ class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin): **kwargs, ) -> CausalLMOutputWithPast: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - Example: ```python diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 6fa6140716..1c98bc5000 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -1830,11 +1830,6 @@ class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin): **kwargs, ) -> CausalLMOutputWithPast: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - Example: ```python diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index 5a0f5d94d8..534f744fc3 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -39,13 +39,13 @@ class Glm4vImagesKwargs(ImagesKwargs): class Glm4vProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Glm4vImagesKwargs - videos_kwargs: Glm4vVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, }, } + images_kwargs: Glm4vImagesKwargs + videos_kwargs: Glm4vVideosProcessorKwargs class Glm4vProcessor(ProcessorMixin): diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py index 354e9dbd5f..51fc1f07e9 100644 --- a/src/transformers/models/helium/modeling_helium.py +++ b/src/transformers/models/helium/modeling_helium.py @@ -472,11 +472,6 @@ class HeliumForCausalLM(HeliumPreTrainedModel, GenerationMixin): **kwargs: Unpack[TransformersKwargs], ) -> CausalLMOutputWithPast: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - Example: ```python diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index 12cbdd7933..30023430ec 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -1501,41 +1501,6 @@ class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]: r""" - qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided - to serve as text prompt, which the Q-Former model will encode. - - Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for - details. - - [What are input IDs?](../glossary#input-ids) - qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be - provided to serve as text prompt, which the language model can continue. - - Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for - details. - - [What are input IDs?](../glossary#input-ids) - decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also - be used by default. - - Only relevant in case an encoder-decoder language model (like T5) is used. - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size - - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., - config.vocab_size]` - - Examples: - ```python >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration >>> import torch diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py index e4ce2cc7b7..718300161f 100644 --- a/src/transformers/models/internvl/modeling_internvl.py +++ b/src/transformers/models/internvl/modeling_internvl.py @@ -901,11 +901,6 @@ class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin) **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple, InternVLCausalLMOutputWithPast]: r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - Example: ```python diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index 23c212d2d3..73d3bf9402 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -223,12 +223,11 @@ def apply_rotary_pos_emb_vision( class MLCDAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper - Multi-headed attention with RoPE. Refer to papers: - - Attention is all you need: - https://huggingface.co/papers/1706.03762 - - RoFormer: Enhanced Transformer with Rotary Position Embedding: - https://huggingface.co/papers/2104.09864 + """Multi-headed attention with RoPE. Refer to papers: + - Attention is all you need: + https://huggingface.co/papers/1706.03762 + - RoFormer: Enhanced Transformer with Rotary Position Embedding: + https://huggingface.co/papers/2104.09864 """ def __init__(self, config: MLCDVisionConfig): diff --git a/src/transformers/models/sam_hq/configuration_sam_hq.py b/src/transformers/models/sam_hq/configuration_sam_hq.py index 1322bc560d..7987510b88 100644 --- a/src/transformers/models/sam_hq/configuration_sam_hq.py +++ b/src/transformers/models/sam_hq/configuration_sam_hq.py @@ -221,8 +221,6 @@ class SamHQMaskDecoderConfig(PretrainedConfig): The dimensionality of the hidden states in the IoU head module. layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. - - vit_dim (`int`, *optional*, defaults to 768): Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module. """ diff --git a/src/transformers/models/sam_hq/modular_sam_hq.py b/src/transformers/models/sam_hq/modular_sam_hq.py index 5dc501dc80..67772cb6c4 100644 --- a/src/transformers/models/sam_hq/modular_sam_hq.py +++ b/src/transformers/models/sam_hq/modular_sam_hq.py @@ -77,8 +77,37 @@ class SamHQVisionConfig(SamVisionConfig): class SamHQMaskDecoderConfig(SamMaskDecoderConfig): r""" - vit_dim (`int`, *optional*, defaults to 768): - Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module. + This is the configuration class to store the configuration of a [`SamHQMaskDecoder`]. It is used to instantiate a SAM_HQ + mask decoder to the specified arguments, defining the model architecture. Instantiating a configuration defaults + will yield a similar configuration to that of the SAM_HQ-vit-h + [facebook/sam_hq-vit-huge](https://huggingface.co/facebook/sam_hq-vit-huge) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the hidden states. + hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function used inside the `SamHQMaskDecoder` module. + mlp_dim (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 2): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + attention_downsample_rate (`int`, *optional*, defaults to 2): + The downsampling rate of the attention layer. + num_multimask_outputs (`int`, *optional*, defaults to 3): + The number of outputs from the `SamHQMaskDecoder` module. In the Segment Anything paper, this is set to 3. + iou_head_depth (`int`, *optional*, defaults to 3): + The number of layers in the IoU head module. + iou_head_hidden_dim (`int`, *optional*, defaults to 256): + The dimensionality of the hidden states in the IoU head module. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + vit_dim (`int`, *optional*, defaults to 768): + Dimensionality of the Vision Transformer (ViT) used in the `SamHQMaskDecoder` module. """ def __init__( diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py index f6a8b6ac46..bbe5a98154 100644 --- a/src/transformers/models/smolvlm/modeling_smolvlm.py +++ b/src/transformers/models/smolvlm/modeling_smolvlm.py @@ -874,16 +874,6 @@ class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin): **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple, SmolVLMCausalLMOutputWithPast]: r""" - pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): - Mask to avoid performing attention on padding pixel indices. - image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): - The hidden states of the image encoder after modality projection. - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `SmolVLMForConditionalGeneration`). - Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only - computed for the tokens with labels in `[0, ..., config.vocab_size]`. - Example: ```python diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index 2280848735..ff51537d81 100644 --- a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -26,81 +26,80 @@ from ...configuration_utils import PretrainedConfig, layer_type_validation class T5GemmaModuleConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the T5GemmaModule-7B. - e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b) - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 256000): - Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`T5GemmaModuleModel`] - hidden_size (`int`, *optional*, defaults to 2304): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 9216): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 26): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 8): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*, defaults to 4): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details, check out [this - paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to - `num_attention_heads`. - head_dim (`int`, *optional*, defaults to 256): - The attention head dimension. - hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"` - if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function. - max_position_embeddings (`int`, *optional*, defaults to 8192): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*, defaults to 0): - Padding token id. - eos_token_id (`int`, *optional*, defaults to 1): - End of stream token id. - bos_token_id (`int`, *optional*, defaults to 2): - Beginning of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `True`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - query_pre_attn_scalar (`float`, *optional*, defaults to 256): - scaling factor used on the attention scores - sliding_window (`int`, *optional*, defaults to 4096): - in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window. - layer_types (`list`, *optional*): - Attention pattern for each layer. - final_logit_softcapping (`float`, *optional*, defaults to 30.0): - scaling factor when applying tanh softcapping on the logits. - attn_logit_softcapping (`float`, *optional*, defaults to 50.0): - scaling factor when applying tanh softcapping on the attention scores. + This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the T5GemmaModule-7B. + e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 256000): + Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`T5GemmaModuleModel`] + hidden_size (`int`, *optional*, defaults to 2304): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 9216): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 26): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 4): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + head_dim (`int`, *optional*, defaults to 256): + The attention head dimension. + hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"` + if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function. + max_position_embeddings (`int`, *optional*, defaults to 8192): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + eos_token_id (`int`, *optional*, defaults to 1): + End of stream token id. + bos_token_id (`int`, *optional*, defaults to 2): + Beginning of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + query_pre_attn_scalar (`float`, *optional*, defaults to 256): + scaling factor used on the attention scores + sliding_window (`int`, *optional*, defaults to 4096): + in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window. + layer_types (`list`, *optional*): + Attention pattern for each layer. + final_logit_softcapping (`float`, *optional*, defaults to 30.0): + scaling factor when applying tanh softcapping on the logits. + attn_logit_softcapping (`float`, *optional*, defaults to 50.0): + scaling factor when applying tanh softcapping on the attention scores. - ```python - >>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig - >>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration - >>> configuration = T5GemmaModuleConfig() - >>> # Initializing a model from the t5_gemma_module-7b style configuration - >>> model = T5GemmaModuleModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ``` - Module config (encoder or decoder): the same as Gemma2Config.""" + ```python + >>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig + >>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration + >>> configuration = T5GemmaModuleConfig() + >>> # Initializing a model from the t5_gemma_module-7b style configuration + >>> model = T5GemmaModuleModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" model_type = "t5_gemma_module" keys_to_ignore_at_inference = ["past_key_values"] diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py index 603d485359..56533855ee 100644 --- a/src/transformers/models/t5gemma/modular_t5gemma.py +++ b/src/transformers/models/t5gemma/modular_t5gemma.py @@ -68,10 +68,7 @@ logger = logging.get_logger(__name__) class T5GemmaModuleConfig(Gemma2Config): - """Module config (encoder or decoder): the same as Gemma2Config.""" - - def __init__(self, **super_kwargs): - super().__init__(**super_kwargs) + pass class T5GemmaConfig(PretrainedConfig): diff --git a/src/transformers/models/zamba2/modeling_zamba2.py b/src/transformers/models/zamba2/modeling_zamba2.py index 13df73c09e..c565808363 100644 --- a/src/transformers/models/zamba2/modeling_zamba2.py +++ b/src/transformers/models/zamba2/modeling_zamba2.py @@ -319,17 +319,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): class Zamba2Attention(nn.Module): """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - - Adapted from transformers.models.mistral.modeling_mistral.MistralAttention: - The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads. - The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer - (see fig. 2 in https://huggingface.co/papers/2405.16712). - Additionally, replaced - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2) - Multi-headed attention from 'Attention Is All You Need' paper. Adapted from transformers.models.mistral.modeling_mistral.MistralAttention: diff --git a/tests/repo_utils/modular/test_conversion_order.py b/tests/repo_utils/modular/test_conversion_order.py index 7f44684b7b..265873c604 100644 --- a/tests/repo_utils/modular/test_conversion_order.py +++ b/tests/repo_utils/modular/test_conversion_order.py @@ -23,7 +23,6 @@ FILES_TO_PARSE = [ os.path.join(MODEL_ROOT, "rt_detr", "modular_rt_detr.py"), os.path.join(MODEL_ROOT, "qwen2", "modular_qwen2.py"), os.path.join(MODEL_ROOT, "qwen3", "modular_qwen3.py"), - os.path.join(MODEL_ROOT, "qwen3", "modular_qwen3_moe.py"), os.path.join(MODEL_ROOT, "llava_next_video", "modular_llava_next_video.py"), os.path.join(MODEL_ROOT, "cohere2", "modular_cohere2.py"), os.path.join(MODEL_ROOT, "modernbert", "modular_modernbert.py"), diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index 187fb60afb..7585695e65 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -249,90 +249,13 @@ class ReplaceMethodCallTransformer(cst.CSTTransformer): return updated_node -def get_docstring_indent(docstring): - # Match the first line after the opening triple quotes - match = re.search(r'(?:"""|\'\'\'|```)\n(\s+)', docstring) - if match: - # Return the indentation spaces captured - return len(match.group(1)) - return 0 - - -def is_full_docstring(original_docstring: str, new_docstring: str, original_level: int) -> bool: - """Check if `new_docstring` is a full docstring, or if it is only part of a docstring that should then - be merged with the existing old one. - """ - # libcst returns the docstrinbgs with literal `r"""` quotes in front - new_docstring = new_docstring.split('"""', 1)[1] - # The docstring contains Args definition, so it is self-contained - if re.search(r"\n\s*Args:\n", new_docstring): - return True - elif re.search(r"\n\s*Args:\n", original_docstring): - return False - # Check if the docstring contains args docstring (meaning it is self contained): - param_pattern = re.compile( - # |--- Group 1 ---|| Group 2 ||- Group 3 -||---------- Group 4 ----------| - rf"^\s{{0,{original_level}}}(\w+)\s*\(\s*([^, \)]*)(\s*.*?)\s*\)\s*:\s*((?:(?!\n^\s{{0,{original_level}}}\w+\s*\().)*)", - re.DOTALL | re.MULTILINE, - ) - match_object = param_pattern.search(new_docstring) - if match_object is not None: - return True - # If it contains Returns, but starts with text indented with an additional 4 spaces before, it is self-contained - # (this is the scenario when using `@add_start_docstrings_to_model_forward`, but adding more args to docstring) - match_object = re.search(r"\n([^\S\n]*)Returns:\n", new_docstring) - if match_object is not None: - full_indent = match_object.group(1) - striped_doc = new_docstring.strip("\n") - if striped_doc.startswith(full_indent + " " * 4) or striped_doc.startswith(full_indent + "\t"): - return True - return False - - -def merge_docstrings(original_docstring, updated_docstring): - original_level = get_docstring_indent(original_docstring) - if not is_full_docstring(original_docstring, updated_docstring, original_level): - # Split the docstring at the example section, assuming `"""` is used to define the docstring - parts = original_docstring.split("```") - if "```" in updated_docstring and len(parts) > 1: - updated_docstring = updated_docstring.lstrip('r"') - new_parts = updated_docstring.split("```") - if len(new_parts) != 3: - raise ValueError("There should only be one example, and it should have opening and closing '```'") - parts[1] = new_parts[1] - updated_docstring = "".join( - [ - f"\n{original_level * ' '}```", - parts[1], - "```", - parts[2], - ] - ) - docstring_opening, original_start_docstring = parts[0].rstrip(" \n").split('"""')[:2] - new_start_docstring = new_parts[0].rstrip(" \n") - docstring_opening += '"""' - if new_start_docstring.startswith(original_start_docstring): - updated_docstring = new_start_docstring + "\n" + updated_docstring - elif original_start_docstring.endswith(new_start_docstring): - updated_docstring = original_start_docstring + "\n" + updated_docstring - else: - updated_docstring = original_start_docstring + "\n" + new_start_docstring + "\n" + updated_docstring - updated_docstring = docstring_opening + updated_docstring - elif updated_docstring not in original_docstring: - # add tabulation if we are at the lowest level. - if re.search(r"\n\s*.*\(.*\)\:\n\s*\w", updated_docstring): - updated_docstring = updated_docstring.replace("\n ", "\n ") - updated_docstring = original_docstring.rstrip('"') + "\n" + updated_docstring.lstrip('r"\n') - return updated_docstring - - class SuperTransformer(cst.CSTTransformer): METADATA_DEPENDENCIES = (ParentNodeProvider,) - def __init__(self, python_module: cst.Module, original_methods, updated_methods, all_bases=None): + def __init__(self, python_module: cst.Module, original_modeling_methods, modular_methods, all_bases=None): self.python_module = python_module - self.original_methods = original_methods - self.updated_methods = updated_methods + self.original_modeling_methods = original_modeling_methods + self.modular_methods = modular_methods self.all_assign_target = {} self.deleted_targets = {} # child node can delete some arguments self.all_bases = all_bases or [] @@ -414,53 +337,39 @@ class SuperTransformer(cst.CSTTransformer): break return new_body - def replace_super_calls(self, node: cst.IndentedBlock, func_name: str) -> cst.CSTNode: + def replace_super_calls(self, node: cst.BaseSuite, func_name: str) -> cst.BaseSuite: """Updates the body of the input `node`'s `func_name` function by replacing calls to super().func_name() with the source code of the parent class' `func_name`. It keeps everything that is defined before `super().func_name()`. """ - self.has_docstring = False - parent_has_docstring = False - if func_name in self.original_methods: - parent_has_docstring = m.matches(self.original_methods[func_name].body.body[0], DOCSTRING_NODE) new_body = [] - has_super_call = False + modular_node_body = node.body - for i, expr in enumerate(node.body): + for i, expr in enumerate(modular_node_body): if is_call_to_super(expr, func_name): - has_super_call = True - new_body.extend(self.update_body(self.original_methods[func_name].body.body, node.body[i + 1 :])) + original_modeling_method_body = self.original_modeling_methods[func_name].body.body + new_body.extend(self.update_body(original_modeling_method_body, modular_node_body[i + 1 :])) new_body = self._fix_init_location(new_body) + return node.with_changes(body=new_body) else: expr = expr.visit(self.transformer) - if m.matches(expr, DOCSTRING_NODE): - self.has_docstring = True - if parent_has_docstring: # actually here we ought to de-duplicate? - original_docstring = self.original_methods[func_name].body.body[0].body[0].value.value - updated_docstring = expr.body[0].value.value - merged_doc = merge_docstrings(original_docstring, updated_docstring) - new_node = [expr.with_changes(body=[cst.Expr(value=cst.SimpleString(value=merged_doc))])] - else: - new_node = [expr] - new_body.extend(new_node) - elif not m.matches(expr, m.SimpleStatementLine(body=[m.Del()])) and not has_super_call: + if not m.matches(expr, m.SimpleStatementLine(body=[m.Del()])): new_body.append(expr) - if not self.has_docstring and parent_has_docstring: - new_body = [self.original_methods[func_name].body.body[0]] + new_body + return node.with_changes(body=new_body) - def leave_FunctionDef(self, original_node: cst.Call, updated_node: cst.Call) -> cst.CSTNode: - if updated_node.name.value in self.updated_methods: - name = updated_node.name.value + def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.FunctionDef) -> cst.FunctionDef: + name = updated_node.name.value + if name in self.modular_methods: new_body = self.replace_super_calls(updated_node.body, name) return updated_node.with_changes(body=new_body, params=updated_node.params) return updated_node - def leave_Return(self, original_node: cst.Return, updated_node: cst.Return) -> cst.CSTNode: + def leave_Return(self, original_node: cst.Return, updated_node: cst.Return) -> cst.Return: """ "When a return statement is reached, it is replaced with the unrolled super code""" if m.matches(updated_node.value, m.Call(func=m.Attribute(attr=m.Name("super")))): func_def = self.get_metadata(ParentNodeProvider, original_node) - if m.matched(func_def, m.FunctionDef()) and func_def.name.value in self.original_methods: + if m.matched(func_def, m.FunctionDef()) and func_def.name.value in self.original_modeling_methods: updated_return_value = updated_node.value.with_changes( args=[ cst.Arg( @@ -979,55 +888,52 @@ def common_partial_suffix(str1: str, str2: str) -> str: def replace_class_node( - mapper: ModelFileMapper, class_node: cst.ClassDef, renamed_super_class: str, original_super_class: str -): + mapper: ModelFileMapper, modular_class_node: cst.ClassDef, renamed_super_class: str, original_super_class: str +) -> cst.ClassDef: """ Replace a class node which inherits from another modeling class. This function works in the following way: - - start from the base class node of the inherited class (a cst.Node) - - replace all methods of the base node with the methods defined in the child class - - append all new methods defined in the child class + - start from the methods and class attributes of the original modeling code node, and replace their definition + if overriden in the modular + - append all new methods and class attributes defined in the child class + - all potential method/class docstrings and decorators use the ones found in modular if any, else in original modeling - replace all calls to super() with the unravelled code - | ```python | | ```python - | class GemmaModel(LlamaModel): | | class GemmaModel(nn.Module): - | def __init__(self): | | def __init__(self): - Going from: | super().__init__() | to: | super().__init__(config) - | self.dropout = 0.2 | | self.dropout = 0.2 - | ``` | | self.padding_idx = config.pad_token_id - | self.vocab_size = config.vocab_size - | self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - | self.layers = nn.ModuleList( - | [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - | ) - | self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - | self.gradient_checkpointing = False - | # Initialize weights and apply final processing - | self.post_init() - | ``` - """ - all_bases = [get_full_attribute_name(k.value) for k in class_node.bases] - if any(base is None for base in all_bases): - raise ValueError(f"Could not parse the name of the bases for {class_node.name.value}") + Args: + mapper (`ModelFileMapper`): + The mapper corresponding to the visited file from which the modular class node inherits. + modular_class_node (`cst.ClassDef`): + The class node as found in the modular file. + renamed_super_class (`str`): + The name of the class from which `modular_class_node` inherits after automatic renaming. + original_super_class (`str`): + The name of the class from which `modular_class_node` inherits before automatic renaming. - original_node = mapper.classes[renamed_super_class] + Returns: + A new class node corresponding to the modular definition. + """ + all_bases = [get_full_attribute_name(k.value) for k in modular_class_node.bases] + if any(base is None for base in all_bases): + raise ValueError(f"Could not parse the name of the bases for {modular_class_node.name.value}") + + original_modeling_node = mapper.classes[renamed_super_class] # Always use the new name of the class (in case we use e.g. `ColPaliForRetrieval` inheriting from `PaliGemmaForConditionalGeneration`) - new_name = class_node.name + new_class_name = modular_class_node.name # If the new class name is different from the renamed super class name, we need to update the docstrings/comments accordingly - if new_name.value != renamed_super_class: - common_suffix = common_partial_suffix(new_name.value, renamed_super_class) + if new_class_name.value != renamed_super_class: + common_suffix = common_partial_suffix(new_class_name.value, renamed_super_class) # Note that this works even without common prefix, in which case it does not replace anything - old, new = renamed_super_class.replace(common_suffix, ""), new_name.value.replace(common_suffix, "") - temp_module = cst.Module(body=[original_node]) - original_node = temp_module.visit( + old, new = renamed_super_class.replace(common_suffix, ""), new_class_name.value.replace(common_suffix, "") + temp_module = cst.Module(body=[original_modeling_node]) + original_modeling_node = temp_module.visit( ReplaceNameTransformer(get_lowercase_name(old), get_lowercase_name(new), only_doc=True) ).body[0] # If we explicitly passed a new base with common suffix to an old base, it is for switching the prefix # e.g. if the "natural" parent class is `PreTrainedModel` but we wanted to rename it to `PreTrainedVisionModel` additional_bases = [base for base in all_bases if base != original_super_class] - new_bases = [] - for original_base in original_node.bases: + new_class_bases = [] + for original_base in original_modeling_node.bases: new_base = original_base # we only potentially switch base for Name-based bases, not Attribute if m.matches(original_base.value, m.Name()): @@ -1038,106 +944,125 @@ def replace_class_node( new_name_node = original_base.value.with_changes(value=additional_base_name) new_base = original_base.with_changes(value=new_name_node) break - new_bases.append(new_base) + new_class_bases.append(new_base) - original_methods = { - f.name.value if hasattr(f, "name") else mapper.python_module.code_for_node(f): f - for f in original_node.body.body + # Use class decorators redefined in modular file if any + new_class_decorators = ( + modular_class_node.decorators if len(modular_class_node.decorators) > 0 else original_modeling_node.decorators + ) + + # Compute new class docstring + original_modeling_docstring = [ + node for node in original_modeling_node.body.body if m.matches(node, DOCSTRING_NODE) + ] + modular_docstring = [node for node in modular_class_node.body.body if m.matches(node, DOCSTRING_NODE)] + # Use class docstring in modular if any, else original modeling code docstring + new_class_docstring = modular_docstring if len(modular_docstring) > 0 else original_modeling_docstring + + # Compute new class attributes + original_modeling_class_attributes = { + node.body[0].targets[0].target.value: node + for node in original_modeling_node.body.body + if m.matches(node, m.SimpleStatementLine(body=[m.Assign()])) } - updated_methods = { - f.name.value if hasattr(f, "name") else mapper.python_module.code_for_node(f): f for f in class_node.body.body + original_modeling_class_attributes.update( + { + node.body[0].target.value: node + for node in original_modeling_node.body.body + if m.matches(node, m.SimpleStatementLine(body=[m.AnnAssign()])) + } + ) + modular_class_attributes = { + node.body[0].targets[0].target.value: node + for node in modular_class_node.body.body + if m.matches(node, m.SimpleStatementLine(body=[m.Assign()])) } - end_meth = [] + modular_class_attributes.update( + { + node.body[0].target.value: node + for node in modular_class_node.body.body + if m.matches(node, m.SimpleStatementLine(body=[m.AnnAssign()])) + } + ) + # Use all original modeling attributes, and potentially override some with values in the modular + new_class_attributes = list({**original_modeling_class_attributes, **modular_class_attributes}.values()) - assign_targets = {} - docstring_node = [] - # Iterate directly from node.body as there can be property/setters with same names which are overwritten when we use a dict - for func in original_node.body.body: - name = func.name.value if hasattr(func, "name") else mapper.python_module.code_for_node(func) - if m.matches(func, m.FunctionDef()) and name in updated_methods and updated_methods[name] is not None: - new_params = updated_methods[name].params - # Replace the method in the replacement class, preserving decorators - kwarg_name = getattr(updated_methods[name].params, "star_kwarg", None) - if kwarg_name and kwarg_name.name.value == "super_kwargs": - parent_params = {k.name.value: k for k in func.params.params} - parent_params.update({k.name.value: k for k in new_params.params[1:]}) - new_params = new_params.with_changes( - params=list(parent_params.values()), star_kwarg=func.params.star_kwarg - ) - # Keep decorators in `modular_xxx.py` if any, else original decorators - new_decorators = ( - updated_methods[name].decorators if len(updated_methods[name].decorators) > 0 else func.decorators - ) + original_modeling_methods = { + node.name.value: node for node in original_modeling_node.body.body if m.matches(node, m.FunctionDef()) + } + modular_methods = { + node.name.value: node for node in modular_class_node.body.body if m.matches(node, m.FunctionDef()) + } - # Keep return annotation in `modular_xxx.py` if any, else original return annotation - new_return_annotation = updated_methods[name].returns if updated_methods[name].returns else func.returns + new_class_methods = [] + # Iterate over the methods of the original modeling code, and add them to the list of methods to add + for name, node in original_modeling_methods.items(): + # If the method was redefined in modular, make appropriate changes to the node + if name in modular_methods: + # Get the corresponding method node in modular + modular_node = modular_methods[name] - if not re.match( - r"\ndef .*\(.*\):\n raise.*Error\(.*", - mapper.python_module.code_for_node(updated_methods[name]), - ): - func = func.with_changes( - body=updated_methods[name].body, - params=new_params, - decorators=new_decorators, - returns=new_return_annotation, - ) - else: + # If we match the pattern, we should avoid inheriting the method + if re.match(r"\ndef .*\(.*\):\n raise.*Error\(.*", mapper.python_module.code_for_node(modular_node)): continue - if m.matches(func, m.SimpleStatementLine(body=[m.Assign()])): - target = mapper.python_module.code_for_node(func.body[0].targets[0]) - assign_targets[target] = func - elif m.matches(func, m.SimpleStatementLine(body=[m.AnnAssign()])): - target = mapper.python_module.code_for_node(func.body[0].target) - assign_targets[target] = func - elif m.matches(func, DOCSTRING_NODE): - docstring_node = [func] - else: - end_meth.append(func) + # Compute new method docstring + modeling_docstring = [node_ for node_ in node.body.body if m.matches(node_, DOCSTRING_NODE)] + modular_docstring = [node_ for node_ in modular_node.body.body if m.matches(node_, DOCSTRING_NODE)] + # Use method docstring in modular if any, else original modeling code docstring + new_body = ( + modular_node.body.body + if len(modular_docstring) > 0 + else modeling_docstring + list(modular_node.body.body) + ) + new_body = modular_node.body.with_changes(body=new_body) + + # Use arguments as defined in the modular + new_params = modular_node.params + + # If using the `**super_kwargs` syntax in modular, merge any existing modular arg with all the original modeling ones + kwarg_name = getattr(modular_node.params, "star_kwarg", None) + if kwarg_name and kwarg_name.name.value == "super_kwargs": + original_modeling_params = {k.name.value: k for k in node.params.params} + modular_params = {k.name.value: k for k in new_params.params[1:]} + new_param_list = list({**original_modeling_params, **modular_params}.values()) + new_params = new_params.with_changes(params=new_param_list, star_kwarg=node.params.star_kwarg) + + # Keep decorators in modular if any, else original decorators + new_decorators = modular_node.decorators if len(modular_node.decorators) > 0 else node.decorators + + # Keep return annotation in modular if any, else original return annotation + new_return_annotation = modular_node.returns if modular_node.returns else node.returns + + # Update the method node + node = node.with_changes( + body=new_body, + params=new_params, + decorators=new_decorators, + returns=new_return_annotation, + ) + + new_class_methods.append(node) # Port new methods that are defined only in modular-file and append at the end - for func in class_node.body.body: - name = func.name.value if hasattr(func, "name") else mapper.python_module.code_for_node(func) - if m.matches(func, DOCSTRING_NODE): # This processes the docstring of the class! - # Extract the original docstring - updated_docstring = func.body[0].value.value - if len(docstring_node) == 0: # If the original docstring is empty, just create one from the updated. - docstring_node = [ - cst.SimpleStatementLine(body=[cst.Expr(value=cst.SimpleString(value=updated_docstring))]) - ] - else: - original_docstring = docstring_node[0].body[0].value.value - merged_doc = merge_docstrings(original_docstring, updated_docstring) - # Update the docstring in the original function - docstring_node = [ - docstring_node[0].with_changes(body=[cst.Expr(value=cst.SimpleString(value=merged_doc))]) - ] - if name not in original_methods and func is not None and isinstance(func, cst.FunctionDef): - end_meth.append(func) - if m.matches(func, m.SimpleStatementLine(body=[m.Assign()])): - # TODO we only use single assign might cause issues - target = mapper.python_module.code_for_node(func.body[0].targets[0]) - assign_targets[target] = func - if m.matches(func, m.SimpleStatementLine(body=[m.AnnAssign()])): - target = mapper.python_module.code_for_node(func.body[0].target) - assign_targets[target] = func - end_meth = docstring_node + list(assign_targets.values()) + end_meth + for name, node in modular_methods.items(): + if name not in original_modeling_methods: + new_class_methods.append(node) - # Replace the calls to `super()` with the unrolled code - result_node = original_node.with_changes(body=cst.IndentedBlock(body=end_meth)) + # Recreate the whole new class body + new_class_body = new_class_docstring + new_class_attributes + new_class_methods + + # Replace the calls to `super()` of the redefined modular methods with the unrolled code + result_node = original_modeling_node.with_changes(body=cst.IndentedBlock(body=new_class_body)) temp_module = cst.Module(body=[result_node]) new_module = MetadataWrapper(temp_module) new_replacement_class = new_module.visit( - SuperTransformer(temp_module, original_methods, updated_methods, all_bases) + SuperTransformer(temp_module, original_modeling_methods, modular_methods, all_bases) ) - new_replacement_body = new_replacement_class.body[0].body # get the indented block + new_class_body = new_replacement_class.body[0].body # get the indented block - # Use decorators redefined in `modular_xxx.py` if any - new_decorators = class_node.decorators if len(class_node.decorators) > 0 else original_node.decorators - - return original_node.with_changes( - body=new_replacement_body, decorators=new_decorators, bases=new_bases, name=new_name + return original_modeling_node.with_changes( + body=new_class_body, decorators=new_class_decorators, bases=new_class_bases, name=new_class_name )