From ed9f252608389c4e5cb2c5a94f5cc47d76855842 Mon Sep 17 00:00:00 2001
From: Ryan Mullins <ryanmullins@google.com>
Date: Mon, 30 Jun 2025 08:10:51 -0400
Subject: [PATCH] docs: Gemma 3n audio encoder (#39087)

Updating Gemma 3n docs and docstrings to clarify the relationship
between the newly trained audio encoder used in Gemma 3n and the USM
model from the original paper.
---
 docs/source/en/model_doc/gemma3n.md                    |  4 ++--
 .../models/gemma3n/configuration_gemma3n.py            |  8 ++++----
 src/transformers/models/gemma3n/modeling_gemma3n.py    |  2 +-
 src/transformers/models/gemma3n/modular_gemma3n.py     | 10 +++++-----
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/source/en/model_doc/gemma3n.md b/docs/source/en/model_doc/gemma3n.md
index 7f38c3b18c..d38368e829 100644
--- a/docs/source/en/model_doc/gemma3n.md
+++ b/docs/source/en/model_doc/gemma3n.md
@@ -32,8 +32,8 @@ this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented R
 [MatFormer][matformer], Per-Layer Embeddings (PLE), activation sparsity, and KV cache sharing. The language model uses
 a similar attention pattern to [Gemma 3](./gemma3.md) with alternating 4 local sliding window self-attention layers for
 every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
-[MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a
-[Universal Speech Model][usm] (USM) as the audio encoder.
+[MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
+trained audio encoder based on the [Universal Speech Model][usm] (USM) architecture.
 
 The instruction-tuned variant was post-trained with knowledge distillation and reinforcement learning.
 
diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py
index ca1a067177..44621500c1 100644
--- a/src/transformers/models/gemma3n/configuration_gemma3n.py
+++ b/src/transformers/models/gemma3n/configuration_gemma3n.py
@@ -301,10 +301,10 @@ class Gemma3nTextConfig(PretrainedConfig):
 
 class Gemma3nAudioConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`], based on Gogole's
-    [Universal Speech Model](). It is used to instantiate an Gemma3nAudioEncoder model according to the specified
-    arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the Gemma 3n E4B, e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).
+    This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`]. It is used to instantiate
+    an `Gemma3nAudioEncoder` model according to the specified arguments, defining the model architecture. Instantiating
+    a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g.,
+    [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).
 
     Configuration objects that inherit from [`Gemma3nAudioConfig`] and can be used to control the model outputs. Read
     the documentation from [`Gemma3nAudioConfig`] for more information.
diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py
index 0817e16451..d29bacf91e 100644
--- a/src/transformers/models/gemma3n/modeling_gemma3n.py
+++ b/src/transformers/models/gemma3n/modeling_gemma3n.py
@@ -911,7 +911,7 @@ class Gemma3nAudioConformerBlock(nn.Module):
 
 
 class Gemma3nAudioEncoder(PreTrainedModel):
-    """A Universal Speech Encoder -- https://arxiv.org/abs/2303.01037"""
+    """An audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture."""
 
     config_class = Gemma3nAudioConfig
 
diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py
index a3ffa710d8..a18ac8c2ef 100644
--- a/src/transformers/models/gemma3n/modular_gemma3n.py
+++ b/src/transformers/models/gemma3n/modular_gemma3n.py
@@ -313,10 +313,10 @@ class Gemma3nTextConfig(Gemma2Config, PretrainedConfig):
 
 class Gemma3nAudioConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`], based on Gogole's
-    [Universal Speech Model](). It is used to instantiate an Gemma3nAudioEncoder model according to the specified
-    arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the Gemma 3n E4B, e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).
+    This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`]. It is used to instantiate
+    an `Gemma3nAudioEncoder` model according to the specified arguments, defining the model architecture. Instantiating
+    a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g.,
+    [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B).
 
     Configuration objects that inherit from [`Gemma3nAudioConfig`] and can be used to control the model outputs. Read
     the documentation from [`Gemma3nAudioConfig`] for more information.
@@ -1473,7 +1473,7 @@ class Gemma3nAudioConformerBlock(nn.Module):
 
 
 class Gemma3nAudioEncoder(PreTrainedModel):
-    """A Universal Speech Encoder -- https://arxiv.org/abs/2303.01037"""
+    """An audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture."""
 
     config_class = Gemma3nAudioConfig