From daab2db33f8d3f4fe4c019da3765353548fce7a9 Mon Sep 17 00:00:00 2001
From: Joao Gante <joaofranciscocardosogante@gmail.com>
Date: Tue, 5 Aug 2025 19:04:59 +0100
Subject: [PATCH] [CI] post-`GptOss` fixes for green CI (#39929)

---
 docs/source/en/_toctree.yml                      |  4 ++--
 docs/source/en/main_classes/quantization.md      |  4 ++++
 .../en/model_doc/{openai_moe.md => gpt_oss.md}   | 16 ++++++++--------
 .../models/granitemoe/modeling_granitemoe.py     |  2 +-
 src/transformers/models/jamba/modeling_jamba.py  |  2 +-
 .../models/jetmoe/modeling_jetmoe.py             |  2 +-
 src/transformers/models/olmoe/modeling_olmoe.py  |  2 +-
 .../models/phimoe/modeling_phimoe.py             |  2 +-
 .../models/qwen2_moe/modeling_qwen2_moe.py       |  1 -
 utils/check_config_attributes.py                 |  2 ++
 10 files changed, 21 insertions(+), 16 deletions(-)
 rename docs/source/en/model_doc/{openai_moe.md => gpt_oss.md} (94%)
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 996029b00b..556b19f011 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -511,6 +511,8 @@
         title: GPT2
       - local: model_doc/gpt_bigcode
         title: GPTBigCode
+      - local: model_doc/gpt_oss
+        title: GptOss
       - local: model_doc/gptsan-japanese
         title: GPTSAN Japanese
       - local: model_doc/gpt-sw3
@@ -617,8 +619,6 @@
         title: OLMoE
       - local: model_doc/open-llama
         title: Open-Llama
-      - local: model_doc/openai_moe
-        title: OpenAIMoe
       - local: model_doc/opt
         title: OPT
       - local: model_doc/pegasus
diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md
index 992f629e5a..e1f4940103 100755
--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@@ -65,6 +65,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
 
 [[autodoc]] HqqConfig
 
+## Mxfp4Config
+
+[[autodoc]] Mxfp4Config
+
 ## FbgemmFp8Config
 
 [[autodoc]] FbgemmFp8Config
diff --git a/docs/source/en/model_doc/openai_moe.md b/docs/source/en/model_doc/gpt_oss.md
similarity index 94%
rename from docs/source/en/model_doc/openai_moe.md
rename to docs/source/en/model_doc/gpt_oss.md
index 2c0b39013d..9b368bdc9e 100644
--- a/docs/source/en/model_doc/openai_moe.md
+++ b/docs/source/en/model_doc/gpt_oss.md
@@ -24,11 +24,11 @@ rendered properly in your Markdown viewer.
     </div>
 </div>
 
-# OpenAIMoE
+# GptOss
 
 ## Overview
 
-The OpenAIMoE model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+The GptOss model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
 <INSERT SHORT SUMMARY HERE>
 
 The abstract from the paper is the following:
@@ -43,16 +43,16 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface
 The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
 
 
-## OpenAIMoeConfig
+## GptOssConfig
 
-[[autodoc]] OpenAIMoeConfig
+[[autodoc]] GptOssConfig
 
-## OpenAIMoeModel
+## GptOssModel
 
-[[autodoc]] OpenAIMoeModel
+[[autodoc]] GptOssModel
     - forward
 
-## OpenAIMoeForCausalLM
+## GptOssForCausalLM
 
-[[autodoc]] OpenAIMoeForCausalLM
+[[autodoc]] GptOssForCausalLM
     - forward
diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py
index 96fc1ca337..8fe6d2f1dc 100644
--- a/src/transformers/models/granitemoe/modeling_granitemoe.py
+++ b/src/transformers/models/granitemoe/modeling_granitemoe.py
@@ -40,7 +40,7 @@ if is_torch_flex_attn_available():
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.jetmoe.modeling_jetmoe.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,
diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py
index 191e82e8e8..34af2f2f2e 100755
--- a/src/transformers/models/jamba/modeling_jamba.py
+++ b/src/transformers/models/jamba/modeling_jamba.py
@@ -67,7 +67,7 @@ is_fast_path_available = all(
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func with gate->router
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func with gate->router
 def load_balancing_loss_func(
     router_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,
diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py
index 9978859441..9e0784152c 100644
--- a/src/transformers/models/jetmoe/modeling_jetmoe.py
+++ b/src/transformers/models/jetmoe/modeling_jetmoe.py
@@ -50,7 +50,7 @@ if is_flash_attn_available():
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,
diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py
index c9540e33af..d3b57883ef 100644
--- a/src/transformers/models/olmoe/modeling_olmoe.py
+++ b/src/transformers/models/olmoe/modeling_olmoe.py
@@ -39,7 +39,7 @@ if is_flash_attn_available():
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,
diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py
index a4735a04ac..8510cd3e8a 100644
--- a/src/transformers/models/phimoe/modeling_phimoe.py
+++ b/src/transformers/models/phimoe/modeling_phimoe.py
@@ -55,7 +55,7 @@ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_m
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,
diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
index a9cc23b37e..8a46795b47 100644
--- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
@@ -59,7 +59,6 @@ if is_torch_flex_attn_available():
 logger = logging.get_logger(__name__)
 
 
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
 def load_balancing_loss_func(
     gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
     num_experts: Optional[int] = None,
diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index 79904b8c2a..82d2b7000e 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -345,6 +345,8 @@ SPECIAL_CASES_TO_ALLOW.update(
         "IdeficsConfig": True,
         "IdeficsVisionConfig": True,
         "IdeficsPerceiverConfig": True,
+        # TODO: @Arthur/Joao: `hidden_act` is unused, remove this entry once that is fixed
+        "GptOssConfig": True,
     }
 )