From ed9f252608389c4e5cb2c5a94f5cc47d76855842 Mon Sep 17 00:00:00 2001 From: Ryan Mullins Date: Mon, 30 Jun 2025 08:10:51 -0400 Subject: [PATCH] docs: Gemma 3n audio encoder (#39087) Updating Gemma 3n docs and docstrings to clarify the relationship between the newly trained audio encoder used in Gemma 3n and the USM model from the original paper. --- docs/source/en/model_doc/gemma3n.md | 4 ++-- .../models/gemma3n/configuration_gemma3n.py | 8 ++++---- src/transformers/models/gemma3n/modeling_gemma3n.py | 2 +- src/transformers/models/gemma3n/modular_gemma3n.py | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/source/en/model_doc/gemma3n.md b/docs/source/en/model_doc/gemma3n.md index 7f38c3b18c..d38368e829 100644 --- a/docs/source/en/model_doc/gemma3n.md +++ b/docs/source/en/model_doc/gemma3n.md @@ -32,8 +32,8 @@ this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented R [MatFormer][matformer], Per-Layer Embeddings (PLE), activation sparsity, and KV cache sharing. The language model uses a similar attention pattern to [Gemma 3](./gemma3.md) with alternating 4 local sliding window self-attention layers for every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces -[MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a -[Universal Speech Model][usm] (USM) as the audio encoder. +[MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly +trained audio encoder based on the [Universal Speech Model][usm] (USM) architecture. The instruction-tuned variant was post-trained with knowledge distillation and reinforcement learning. 
diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index ca1a067177..44621500c1 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -301,10 +301,10 @@ class Gemma3nTextConfig(PretrainedConfig): class Gemma3nAudioConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`], based on Gogole's - [Universal Speech Model](). It is used to instantiate an Gemma3nAudioEncoder model according to the specified - arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar - configuration to that of the Gemma 3n E4B, e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B). + This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`]. It is used to instantiate + a `Gemma3nAudioEncoder` model according to the specified arguments, defining the model architecture. Instantiating + a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g., + [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B). Configuration objects that inherit from [`Gemma3nAudioConfig`] and can be used to control the model outputs. Read the documentation from [`Gemma3nAudioConfig`] for more information. 
diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 0817e16451..d29bacf91e 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -911,7 +911,7 @@ class Gemma3nAudioConformerBlock(nn.Module): class Gemma3nAudioEncoder(PreTrainedModel): - """A Universal Speech Encoder -- https://arxiv.org/abs/2303.01037""" + """An audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture.""" config_class = Gemma3nAudioConfig diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index a3ffa710d8..a18ac8c2ef 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -313,10 +313,10 @@ class Gemma3nTextConfig(Gemma2Config, PretrainedConfig): class Gemma3nAudioConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`], based on Gogole's - [Universal Speech Model](). It is used to instantiate an Gemma3nAudioEncoder model according to the specified - arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar - configuration to that of the Gemma 3n E4B, e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B). + This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`]. It is used to instantiate + a `Gemma3nAudioEncoder` model according to the specified arguments, defining the model architecture. Instantiating + a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g., + [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B). Configuration objects that inherit from [`Gemma3nAudioConfig`] and can be used to control the model outputs. 
Read the documentation from [`Gemma3nAudioConfig`] for more information. @@ -1473,7 +1473,7 @@ class Gemma3nAudioConformerBlock(nn.Module): class Gemma3nAudioEncoder(PreTrainedModel): - """A Universal Speech Encoder -- https://arxiv.org/abs/2303.01037""" + """An audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture.""" config_class = Gemma3nAudioConfig