From 7ef592c96cbf616492ac4181a390d465189abec6 Mon Sep 17 00:00:00 2001
From: Biao Zhang <17406686+bzhangGo@users.noreply.github.com>
Date: Tue, 8 Jul 2025 13:08:48 -0400
Subject: [PATCH] Update T5gemma (#39210)

* bug fix: add vocab_size to t5gemmaconfig for pipeline.

* Update checkpoint placeholder

* minor change

* minor change

* minor change: update example.

* fix: add vocab_size as an explicit arg.

* bug fix:

remove vocab_size verification; instead, re-set encoder/decoder vocab size.

Note, in t5gemma, vocab size of encoder/decoder should always be the same.

* add `add_generation_prompt` for message preprocessing.
---
 docs/source/en/model_doc/t5gemma.md           | 46 +++++++++++++------
 .../models/t5gemma/configuration_t5gemma.py   | 11 ++++-
 .../models/t5gemma/modular_t5gemma.py         | 14 ++++--
 3 files changed, 51 insertions(+), 20 deletions(-)
diff --git a/docs/source/en/model_doc/t5gemma.md b/docs/source/en/model_doc/t5gemma.md
index d8615a9add..72140b21d3 100644
--- a/docs/source/en/model_doc/t5gemma.md
+++ b/docs/source/en/model_doc/t5gemma.md
@@ -14,7 +14,13 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 
 -->
-
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>
 
 # T5Gemma
 
@@ -24,6 +30,9 @@ T5Gemma has two groups of model sizes: 1) [Gemma 2](https://ai.google.dev/gemma/
 
 The pretrained varaints are trained with two objectives: prefix language modeling with knowledge distillation (PrefixLM) and UL2, separately. We release both variants for each model size. The instruction-turned varaints was post-trained with supervised fine-tuning and reinforcement learning.
 
+> [!TIP]
+> Click on the T5Gemma models in the right sidebar for more examples of how to apply T5Gemma to different language tasks.
+
 The example below demonstrates how to chat with the model with [`Pipeline`] or the [`AutoModel`] class, and from the command line.
 
 <hfoptions id="usage">
@@ -35,43 +44,52 @@ import torch
 from transformers import pipeline
 
 pipe = pipeline(
-    task="text2text-generation",
-    model="google/t5gemma-placeholder",
+    "text2text-generation",
+    model="google/t5gemma-2b-2b-prefixlm-it",
     torch_dtype=torch.bfloat16,
-    device="cuda",
+    device="cuda",  # replace with "mps" to run on a Mac device
 )
 
-pipe("Question: Why is the sky blue?\nAnswer:", max_new_tokens=50)
+messages = [
+    {"role": "user", "content": "Tell me an unknown interesting biology fact about the brain."},
+]
+prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+pipe(prompt, max_new_tokens=32)
 ```
 
 </hfoption>
 <hfoption id="AutoModel">
 
 ```python
-import torch
+# pip install accelerate
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
 
-tokenizer = AutoTokenizer.from_pretrained("google/t5gemma-placeholder")
+tokenizer = AutoTokenizer.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
 model = AutoModelForSeq2SeqLM.from_pretrained(
-    "google/t5gemma-placeholder",
+    "google/t5gemma-2b-2b-prefixlm-it",
+    device_map="auto",
     torch_dtype=torch.bfloat16,
-    device_map="auto"
 )
 
-input_text = "Question: Why is the sky blue?\nAnswer:"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+messages = [
+    {"role": "user", "content": "Tell me an unknown interesting biology fact about the brain."},
+]
+input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True, add_generation_prompt=True).to("cuda")
 
 outputs = model.generate(**input_ids, max_new_tokens=32)
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-
+print(tokenizer.decode(outputs[0]))
 ```
 
 </hfoption>
 <hfoption id="transformers CLI">
 
 ```
-echo -e "Question: Why is the sky blue? Answer:" | transformers run --task text2text-generation --model google/t5gemma-placeholder --device 0
+echo -e "Write me a poem about Machine Learning. Answer:" | transformers run --task text2text-generation --model google/t5gemma-2b-2b-prefixlm --device 0
 ```
+</hfoption>
+</hfoptions>
 
 ## T5GemmaConfig
 
diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py
index ff51537d81..bc195d562f 100644
--- a/src/transformers/models/t5gemma/configuration_t5gemma.py
+++ b/src/transformers/models/t5gemma/configuration_t5gemma.py
@@ -186,10 +186,10 @@ class T5GemmaConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`T5GemmaModel`]. It is used to instantiate an T5Gemma
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to a hypothetical balanced Gemma2 encoder-decoder model.
-    e.g. [google/t5gemma-placeholder](https://huggingface.co/google/t5gemma-placeholder)
+    e.g. [google/t5gemma-2b-2b-prefixlm-it](https://huggingface.co/google/t5gemma-2b-2b-prefixlm-it)
     ```python
     >>> from transformers import T5GemmaConfig, T5GemmaModel
-    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-placeholder")
+    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
     >>> model = T5GemmaModel(t5gemma_config)
     ```
     Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the
@@ -209,6 +209,8 @@ class T5GemmaConfig(PretrainedConfig):
             The dropout ratio for attention.
         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
             Whether tie input and output embeddings.
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the T5Gemma model (the same as Gemma 2).
         kwargs (additional keyword arguments, optional, *optional*):
             Will be passed to the PretrainedConfig base class.
     """
@@ -257,6 +259,7 @@ class T5GemmaConfig(PretrainedConfig):
         classifier_dropout_rate: float = 0.0,
         attention_dropout: float = 0.0,
         tie_word_embeddings: bool = True,
+        vocab_size: int = 256000,
         **kwargs,
     ):
         if isinstance(encoder, dict):
@@ -302,6 +305,9 @@ class T5GemmaConfig(PretrainedConfig):
         self.classifier_dropout_rate = classifier_dropout_rate
         self.tie_word_embeddings = tie_word_embeddings
 
+        # Used in pipeline generation.
+        self.vocab_size = vocab_size
+
     def __setattr__(self, key, value):
         shared_attr_with_submodules = [
             "output_hidden_states",
@@ -309,6 +315,7 @@ class T5GemmaConfig(PretrainedConfig):
             "_attn_implementation",
             "dropout_rate",
             "attention_dropout",
+            "vocab_size",
         ]
 
         if key in shared_attr_with_submodules:
diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py
index 56533855ee..9360008e30 100644
--- a/src/transformers/models/t5gemma/modular_t5gemma.py
+++ b/src/transformers/models/t5gemma/modular_t5gemma.py
@@ -56,8 +56,7 @@ from ..gemma2.modeling_gemma2 import (
 )
 
 
-# TODO(bzhanggo): figure out these documentations
-_CHECKPOINT_FOR_DOC = "google/t5gemma-placeholder"
+_CHECKPOINT_FOR_DOC = "google/t5gemma-2b-2b-prefixlm-it"
 
 
 if is_torch_flex_attn_available():
@@ -76,10 +75,10 @@ class T5GemmaConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`T5GemmaModel`]. It is used to instantiate an T5Gemma
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to a hypothetical balanced Gemma2 encoder-decoder model.
-    e.g. [google/t5gemma-placeholder](https://huggingface.co/google/t5gemma-placeholder)
+    e.g. [google/t5gemma-2b-2b-prefixlm-it](https://huggingface.co/google/t5gemma-2b-2b-prefixlm-it)
     ```python
     >>> from transformers import T5GemmaConfig, T5GemmaModel
-    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-placeholder")
+    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
     >>> model = T5GemmaModel(t5gemma_config)
     ```
     Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the
@@ -99,6 +98,8 @@ class T5GemmaConfig(PretrainedConfig):
             The dropout ratio for attention.
         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
             Whether tie input and output embeddings.
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the T5Gemma model (the same as Gemma 2).
         kwargs (additional keyword arguments, optional, *optional*):
             Will be passed to the PretrainedConfig base class.
     """
@@ -147,6 +148,7 @@ class T5GemmaConfig(PretrainedConfig):
         classifier_dropout_rate: float = 0.0,
         attention_dropout: float = 0.0,
         tie_word_embeddings: bool = True,
+        vocab_size: int = 256000,
         **kwargs,
     ):
         if isinstance(encoder, dict):
@@ -192,6 +194,9 @@ class T5GemmaConfig(PretrainedConfig):
         self.classifier_dropout_rate = classifier_dropout_rate
         self.tie_word_embeddings = tie_word_embeddings
 
+        # Used in pipeline generation.
+        self.vocab_size = vocab_size
+
     def __setattr__(self, key, value):
         shared_attr_with_submodules = [
             "output_hidden_states",
@@ -199,6 +204,7 @@ class T5GemmaConfig(PretrainedConfig):
             "_attn_implementation",
             "dropout_rate",
             "attention_dropout",
+            "vocab_size",
         ]
 
         if key in shared_attr_with_submodules: