[CI] post-GptOss fixes for green CI (#39929)

2025-08-05 19:04:59 +01:00
parent 06f8004e5c
commit daab2db33f
10 changed files with 21 additions and 16 deletions
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -511,6 +511,8 @@
        title: GPT2
      - local: model_doc/gpt_bigcode
        title: GPTBigCode
      - local: model_doc/gpt_oss
        title: GptOss
      - local: model_doc/gptsan-japanese
        title: GPTSAN Japanese
      - local: model_doc/gpt-sw3
@@ -617,8 +619,6 @@
        title: OLMoE
      - local: model_doc/open-llama
        title: Open-Llama
      - local: model_doc/openai_moe
        title: OpenAIMoe
      - local: model_doc/opt
        title: OPT
      - local: model_doc/pegasus
--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@@ -65,6 +65,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
 [[autodoc]] HqqConfig
 ## Mxfp4Config
 [[autodoc]] Mxfp4Config
 ## FbgemmFp8Config
 [[autodoc]] FbgemmFp8Config
--- a/docs/source/en/model_doc/openai_moe.md
+++ b/docs/source/en/model_doc/openai_moe.md
@@ -24,11 +24,11 @@ rendered properly in your Markdown viewer.
    </div>
 </div>
-# OpenAIMoE
+# GptOss
 ## Overview
-The OpenAIMoE model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+The GptOss model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
 <INSERT SHORT SUMMARY HERE>
 The abstract from the paper is the following:
@@ -43,16 +43,16 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface
 The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
-## OpenAIMoeConfig
+## GptOssConfig
-[[autodoc]] OpenAIMoeConfig
+[[autodoc]] GptOssConfig
-## OpenAIMoeModel
+## GptOssModel
-[[autodoc]] OpenAIMoeModel
+[[autodoc]] GptOssModel
    - forward
-## OpenAIMoeForCausalLM
+## GptOssForCausalLM
-[[autodoc]] OpenAIMoeForCausalLM
+[[autodoc]] GptOssForCausalLM
    - forward
--- a/src/transformers/models/granitemoe/modeling_granitemoe.py
+++ b/src/transformers/models/granitemoe/modeling_granitemoe.py
@@ -40,7 +40,7 @@ if is_torch_flex_attn_available():
 logger = logging.get_logger(__name__)
-# Copied from transformers.models.jetmoe.modeling_jetmoe.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
--- a/src/transformers/models/jamba/modeling_jamba.py
+++ b/src/transformers/models/jamba/modeling_jamba.py
@@ -67,7 +67,7 @@ is_fast_path_available = all(
 logger = logging.get_logger(__name__)
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func with gate->router
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func with gate->router
 def load_balancing_loss_func(
    router_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
--- a/src/transformers/models/jetmoe/modeling_jetmoe.py
+++ b/src/transformers/models/jetmoe/modeling_jetmoe.py
@@ -50,7 +50,7 @@ if is_flash_attn_available():
 logger = logging.get_logger(__name__)
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
--- a/src/transformers/models/olmoe/modeling_olmoe.py
+++ b/src/transformers/models/olmoe/modeling_olmoe.py
@@ -39,7 +39,7 @@ if is_flash_attn_available():
 logger = logging.get_logger(__name__)
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
--- a/src/transformers/models/phimoe/modeling_phimoe.py
+++ b/src/transformers/models/phimoe/modeling_phimoe.py
@@ -55,7 +55,7 @@ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_m
 logger = logging.get_logger(__name__)
-# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
+# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
 def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
--- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
@@ -59,7 +59,6 @@ if is_torch_flex_attn_available():
 logger = logging.get_logger(__name__)
 # Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
 def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -345,6 +345,8 @@ SPECIAL_CASES_TO_ALLOW.update(
        "IdeficsConfig": True,
        "IdeficsVisionConfig": True,
        "IdeficsPerceiverConfig": True,
        # TODO(@Arthur/Joao): `hidden_act` is currently unused; drop this entry once that is resolved
        "GptOssConfig": True,
    }
 )