[CI] post-GptOss fixes for green CI (#39929)
This commit is contained in:
@@ -511,6 +511,8 @@
|
|||||||
title: GPT2
|
title: GPT2
|
||||||
- local: model_doc/gpt_bigcode
|
- local: model_doc/gpt_bigcode
|
||||||
title: GPTBigCode
|
title: GPTBigCode
|
||||||
|
- local: model_doc/gpt_oss
|
||||||
|
title: GptOss
|
||||||
- local: model_doc/gptsan-japanese
|
- local: model_doc/gptsan-japanese
|
||||||
title: GPTSAN Japanese
|
title: GPTSAN Japanese
|
||||||
- local: model_doc/gpt-sw3
|
- local: model_doc/gpt-sw3
|
||||||
@@ -617,8 +619,6 @@
|
|||||||
title: OLMoE
|
title: OLMoE
|
||||||
- local: model_doc/open-llama
|
- local: model_doc/open-llama
|
||||||
title: Open-Llama
|
title: Open-Llama
|
||||||
- local: model_doc/openai_moe
|
|
||||||
title: OpenAIMoe
|
|
||||||
- local: model_doc/opt
|
- local: model_doc/opt
|
||||||
title: OPT
|
title: OPT
|
||||||
- local: model_doc/pegasus
|
- local: model_doc/pegasus
|
||||||
|
|||||||
@@ -65,6 +65,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
|
|||||||
|
|
||||||
[[autodoc]] HqqConfig
|
[[autodoc]] HqqConfig
|
||||||
|
|
||||||
|
## Mxfp4Config
|
||||||
|
|
||||||
|
[[autodoc]] Mxfp4Config
|
||||||
|
|
||||||
## FbgemmFp8Config
|
## FbgemmFp8Config
|
||||||
|
|
||||||
[[autodoc]] FbgemmFp8Config
|
[[autodoc]] FbgemmFp8Config
|
||||||
|
|||||||
@@ -24,11 +24,11 @@ rendered properly in your Markdown viewer.
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
# OpenAIMoE
|
# GptOss
|
||||||
|
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
The OpenAIMoE model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
|
The GptOss model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
|
||||||
<INSERT SHORT SUMMARY HERE>
|
<INSERT SHORT SUMMARY HERE>
|
||||||
|
|
||||||
The abstract from the paper is the following:
|
The abstract from the paper is the following:
|
||||||
@@ -43,16 +43,16 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface
|
|||||||
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
|
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
|
||||||
|
|
||||||
|
|
||||||
## OpenAIMoeConfig
|
## GptOssConfig
|
||||||
|
|
||||||
[[autodoc]] OpenAIMoeConfig
|
[[autodoc]] GptOssConfig
|
||||||
|
|
||||||
## OpenAIMoeModel
|
## GptOssModel
|
||||||
|
|
||||||
[[autodoc]] OpenAIMoeModel
|
[[autodoc]] GptOssModel
|
||||||
- forward
|
- forward
|
||||||
|
|
||||||
## OpenAIMoeForCausalLM
|
## GptOssForCausalLM
|
||||||
|
|
||||||
[[autodoc]] OpenAIMoeForCausalLM
|
[[autodoc]] GptOssForCausalLM
|
||||||
- forward
|
- forward
|
||||||
@@ -40,7 +40,7 @@ if is_torch_flex_attn_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.jetmoe.modeling_jetmoe.load_balancing_loss_func
|
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
|
||||||
def load_balancing_loss_func(
|
def load_balancing_loss_func(
|
||||||
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||||
num_experts: Optional[int] = None,
|
num_experts: Optional[int] = None,
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ is_fast_path_available = all(
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func with gate->router
|
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func with gate->router
|
||||||
def load_balancing_loss_func(
|
def load_balancing_loss_func(
|
||||||
router_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
router_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||||
num_experts: Optional[int] = None,
|
num_experts: Optional[int] = None,
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ if is_flash_attn_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
|
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
|
||||||
def load_balancing_loss_func(
|
def load_balancing_loss_func(
|
||||||
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||||
num_experts: Optional[int] = None,
|
num_experts: Optional[int] = None,
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ if is_flash_attn_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
|
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
|
||||||
def load_balancing_loss_func(
|
def load_balancing_loss_func(
|
||||||
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||||
num_experts: Optional[int] = None,
|
num_experts: Optional[int] = None,
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_m
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
|
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
|
||||||
def load_balancing_loss_func(
|
def load_balancing_loss_func(
|
||||||
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||||
num_experts: Optional[int] = None,
|
num_experts: Optional[int] = None,
|
||||||
|
|||||||
@@ -59,7 +59,6 @@ if is_torch_flex_attn_available():
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
|
|
||||||
def load_balancing_loss_func(
|
def load_balancing_loss_func(
|
||||||
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||||
num_experts: Optional[int] = None,
|
num_experts: Optional[int] = None,
|
||||||
|
|||||||
@@ -345,6 +345,8 @@ SPECIAL_CASES_TO_ALLOW.update(
|
|||||||
"IdeficsConfig": True,
|
"IdeficsConfig": True,
|
||||||
"IdeficsVisionConfig": True,
|
"IdeficsVisionConfig": True,
|
||||||
"IdeficsPerceiverConfig": True,
|
"IdeficsPerceiverConfig": True,
|
||||||
|
# TODO: @Arthur/Joao (`hidden_act` unused)
|
||||||
|
"GptOssConfig": True,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user