[modular] Simplify logic and docstring handling (#39185)

* simplify a lot

* Update modular_model_converter.py

* finalize

* remove outdated functions

* apply it

* and examples
This commit is contained in:
Cyril Vallez
2025-07-07 14:52:57 +02:00
committed by GitHub
parent f16fbfb89a
commit 056fa73fae
25 changed files with 380 additions and 465 deletions

View File

@@ -437,32 +437,6 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
num_logits_to_keep: int = 0,
) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
Example:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, NewTaskModelForNewTask
>>> model = NewTaskModelForNewTask.from_pretrained("google/new_task_model2-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/new_task_model2-3b-mix-224")
>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```
Returns:
"""
vlm_outputs = super().forward(