From daab2db33f8d3f4fe4c019da3765353548fce7a9 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 5 Aug 2025 19:04:59 +0100 Subject: [PATCH] [CI] post-`GptOss` fixes for green CI (#39929) --- docs/source/en/_toctree.yml | 4 ++-- docs/source/en/main_classes/quantization.md | 4 ++++ .../en/model_doc/{openai_moe.md => gpt_oss.md} | 16 ++++++++-------- .../models/granitemoe/modeling_granitemoe.py | 2 +- src/transformers/models/jamba/modeling_jamba.py | 2 +- .../models/jetmoe/modeling_jetmoe.py | 2 +- src/transformers/models/olmoe/modeling_olmoe.py | 2 +- .../models/phimoe/modeling_phimoe.py | 2 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 1 - utils/check_config_attributes.py | 2 ++ 10 files changed, 21 insertions(+), 16 deletions(-) rename docs/source/en/model_doc/{openai_moe.md => gpt_oss.md} (94%) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 996029b00b..556b19f011 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -511,6 +511,8 @@ title: GPT2 - local: model_doc/gpt_bigcode title: GPTBigCode + - local: model_doc/gpt_oss + title: GptOss - local: model_doc/gptsan-japanese title: GPTSAN Japanese - local: model_doc/gpt-sw3 @@ -617,8 +619,6 @@ title: OLMoE - local: model_doc/open-llama title: Open-Llama - - local: model_doc/openai_moe - title: OpenAIMoe - local: model_doc/opt title: OPT - local: model_doc/pegasus diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md index 992f629e5a..e1f4940103 100755 --- a/docs/source/en/main_classes/quantization.md +++ b/docs/source/en/main_classes/quantization.md @@ -65,6 +65,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide. [[autodoc]] HqqConfig +## Mxfp4Config + +[[autodoc]] Mxfp4Config + ## FbgemmFp8Config [[autodoc]] FbgemmFp8Config diff --git a/docs/source/en/model_doc/openai_moe.md b/docs/source/en/model_doc/gpt_oss.md similarity index 94% rename from docs/source/en/model_doc/openai_moe.md rename to docs/source/en/model_doc/gpt_oss.md index 2c0b39013d..9b368bdc9e 100644 --- a/docs/source/en/model_doc/openai_moe.md +++ b/docs/source/en/model_doc/gpt_oss.md @@ -24,11 +24,11 @@ rendered properly in your Markdown viewer. -# OpenAIMoE +# GptOss ## Overview -The OpenAIMoE model was proposed in []() by . +The GptOss model was proposed in []() by . The abstract from the paper is the following: @@ -43,16 +43,16 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface The original code can be found [here](). -## OpenAIMoeConfig +## GptOssConfig -[[autodoc]] OpenAIMoeConfig +[[autodoc]] GptOssConfig -## OpenAIMoeModel +## GptOssModel -[[autodoc]] OpenAIMoeModel +[[autodoc]] GptOssModel - forward -## OpenAIMoeForCausalLM +## GptOssForCausalLM -[[autodoc]] OpenAIMoeForCausalLM +[[autodoc]] GptOssForCausalLM - forward diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index 96fc1ca337..8fe6d2f1dc 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -40,7 +40,7 @@ if is_torch_flex_attn_available(): logger = logging.get_logger(__name__) -# Copied from transformers.models.jetmoe.modeling_jetmoe.load_balancing_loss_func +# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func def load_balancing_loss_func( gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None], num_experts: Optional[int] = None, diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 191e82e8e8..34af2f2f2e 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -67,7 +67,7 @@ is_fast_path_available = all( logger = logging.get_logger(__name__) -# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func with gate->router +# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func with gate->router def load_balancing_loss_func( router_logits: Union[torch.Tensor, tuple[torch.Tensor], None], num_experts: Optional[int] = None, diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 9978859441..9e0784152c 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -50,7 +50,7 @@ if is_flash_attn_available(): logger = logging.get_logger(__name__) -# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func +# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func def load_balancing_loss_func( gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None], num_experts: Optional[int] = None, diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index c9540e33af..d3b57883ef 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -39,7 +39,7 @@ if is_flash_attn_available(): logger = logging.get_logger(__name__) -# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func +# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func def load_balancing_loss_func( gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None], num_experts: Optional[int] = None, diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index a4735a04ac..8510cd3e8a 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -55,7 +55,7 @@ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_m logger = logging.get_logger(__name__) -# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func +# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func def load_balancing_loss_func( gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None], num_experts: Optional[int] = None, diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index a9cc23b37e..8a46795b47 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -59,7 +59,6 @@ if is_torch_flex_attn_available(): logger = logging.get_logger(__name__) -# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func def load_balancing_loss_func( gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None], num_experts: Optional[int] = None, diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 79904b8c2a..82d2b7000e 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -345,6 +345,8 @@ SPECIAL_CASES_TO_ALLOW.update( "IdeficsConfig": True, "IdeficsVisionConfig": True, "IdeficsPerceiverConfig": True, + # TODO: @Arthur/Joao (`hidden_act` unused) + "GptOssConfig": True, } )