[CI] post-GptOss fixes for green CI (#39929)
This commit is contained in:
@@ -511,6 +511,8 @@
|
||||
title: GPT2
|
||||
- local: model_doc/gpt_bigcode
|
||||
title: GPTBigCode
|
||||
- local: model_doc/gpt_oss
|
||||
title: GptOss
|
||||
- local: model_doc/gptsan-japanese
|
||||
title: GPTSAN Japanese
|
||||
- local: model_doc/gpt-sw3
|
||||
@@ -617,8 +619,6 @@
|
||||
title: OLMoE
|
||||
- local: model_doc/open-llama
|
||||
title: Open-Llama
|
||||
- local: model_doc/openai_moe
|
||||
title: OpenAIMoe
|
||||
- local: model_doc/opt
|
||||
title: OPT
|
||||
- local: model_doc/pegasus
|
||||
|
||||
@@ -65,6 +65,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
|
||||
|
||||
[[autodoc]] HqqConfig
|
||||
|
||||
## Mxfp4Config
|
||||
|
||||
[[autodoc]] Mxfp4Config
|
||||
|
||||
## FbgemmFp8Config
|
||||
|
||||
[[autodoc]] FbgemmFp8Config
|
||||
|
||||
@@ -24,11 +24,11 @@ rendered properly in your Markdown viewer.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# OpenAIMoE
|
||||
# GptOss
|
||||
|
||||
## Overview
|
||||
|
||||
The OpenAIMoE model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
|
||||
The GptOss model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
|
||||
<INSERT SHORT SUMMARY HERE>
|
||||
|
||||
The abstract from the paper is the following:
|
||||
@@ -43,16 +43,16 @@ This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface
|
||||
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
|
||||
|
||||
|
||||
## OpenAIMoeConfig
|
||||
## GptOssConfig
|
||||
|
||||
[[autodoc]] OpenAIMoeConfig
|
||||
[[autodoc]] GptOssConfig
|
||||
|
||||
## OpenAIMoeModel
|
||||
## GptOssModel
|
||||
|
||||
[[autodoc]] OpenAIMoeModel
|
||||
[[autodoc]] GptOssModel
|
||||
- forward
|
||||
|
||||
## OpenAIMoeForCausalLM
|
||||
## GptOssForCausalLM
|
||||
|
||||
[[autodoc]] OpenAIMoeForCausalLM
|
||||
[[autodoc]] GptOssForCausalLM
|
||||
- forward
|
||||
@@ -40,7 +40,7 @@ if is_torch_flex_attn_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
# Copied from transformers.models.jetmoe.modeling_jetmoe.load_balancing_loss_func
|
||||
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
|
||||
def load_balancing_loss_func(
|
||||
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||
num_experts: Optional[int] = None,
|
||||
|
||||
@@ -67,7 +67,7 @@ is_fast_path_available = all(
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func with gate->router
|
||||
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func with gate->router
|
||||
def load_balancing_loss_func(
|
||||
router_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||
num_experts: Optional[int] = None,
|
||||
|
||||
@@ -50,7 +50,7 @@ if is_flash_attn_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
|
||||
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
|
||||
def load_balancing_loss_func(
|
||||
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||
num_experts: Optional[int] = None,
|
||||
|
||||
@@ -39,7 +39,7 @@ if is_flash_attn_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
|
||||
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
|
||||
def load_balancing_loss_func(
|
||||
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||
num_experts: Optional[int] = None,
|
||||
|
||||
@@ -55,7 +55,7 @@ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_m
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
|
||||
# Copied from transformers.models.qwen2_moe.modeling_qwen2_moe.load_balancing_loss_func
|
||||
def load_balancing_loss_func(
|
||||
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||
num_experts: Optional[int] = None,
|
||||
|
||||
@@ -59,7 +59,6 @@ if is_torch_flex_attn_available():
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
|
||||
def load_balancing_loss_func(
|
||||
gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
|
||||
num_experts: Optional[int] = None,
|
||||
|
||||
@@ -345,6 +345,8 @@ SPECIAL_CASES_TO_ALLOW.update(
|
||||
"IdeficsConfig": True,
|
||||
"IdeficsVisionConfig": True,
|
||||
"IdeficsPerceiverConfig": True,
|
||||
# TODO: @Arthur/Joao (`hidden_act` unused)
|
||||
"GptOssConfig": True,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user