@@ -38,7 +38,7 @@ However, no method can be called on that object:
|
|||||||
```python
|
```python
|
||||||
>>> DetrImageProcessorFast.from_pretrained()
|
>>> DetrImageProcessorFast.from_pretrained()
|
||||||
ImportError:
|
ImportError:
|
||||||
DetrImageProcessorFast requires the Torchvision library but it was not found in your environment. Checkout the instructions on the
|
DetrImageProcessorFast requires the Torchvision library but it was not found in your environment. Check out the instructions on the
|
||||||
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
|
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
|
||||||
Please note that you may need to restart your runtime after installation.
|
Please note that you may need to restart your runtime after installation.
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -546,7 +546,7 @@ def main():
|
|||||||
# region Tokenizer check: this script requires a fast tokenizer.
|
# region Tokenizer check: this script requires a fast tokenizer.
|
||||||
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"This example script only works for models that have a fast tokenizer. Checkout the big table of models at"
|
"This example script only works for models that have a fast tokenizer. Check out the big table of models at"
|
||||||
" https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
|
" https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
|
||||||
" this requirement"
|
" this requirement"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ class MyNewModelConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ class NewModelConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
head_dim (`int`, *optional*, defaults to 256):
|
head_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -357,7 +357,7 @@ def main():
|
|||||||
# Tokenizer check: this script requires a fast tokenizer.
|
# Tokenizer check: this script requires a fast tokenizer.
|
||||||
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"This example script only works for models that have a fast tokenizer. Checkout the big table of models at"
|
"This example script only works for models that have a fast tokenizer. Check out the big table of models at"
|
||||||
" https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
|
" https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
|
||||||
" this requirement"
|
" this requirement"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -399,7 +399,7 @@ def main():
|
|||||||
# Tokenizer check: this script requires a fast tokenizer.
|
# Tokenizer check: this script requires a fast tokenizer.
|
||||||
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"This example script only works for models that have a fast tokenizer. Checkout the big table of models at"
|
"This example script only works for models that have a fast tokenizer. Check out the big table of models at"
|
||||||
" https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
|
" https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
|
||||||
" this requirement"
|
" this requirement"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -378,7 +378,7 @@ def main():
|
|||||||
# region Tokenizer check: this script requires a fast tokenizer.
|
# region Tokenizer check: this script requires a fast tokenizer.
|
||||||
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"This example script only works for models that have a fast tokenizer. Checkout the big table of models at"
|
"This example script only works for models that have a fast tokenizer. Check out the big table of models at"
|
||||||
" https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
|
" https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet"
|
||||||
" this requirement"
|
" this requirement"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class AriaTextConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ class AriaTextConfig(LlamaConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ class BambaConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ class BitNetConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ class ChameleonConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -446,7 +446,7 @@ def main():
|
|||||||
"--model_size",
|
"--model_size",
|
||||||
choices=["7B", "30B"],
|
choices=["7B", "30B"],
|
||||||
help=""
|
help=""
|
||||||
" models correspond to the finetuned versions, and are specific to the Chameleon official release. For more details on Chameleon, checkout the original repo: https://github.com/facebookresearch/chameleon",
|
" models correspond to the finetuned versions, and are specific to the Chameleon official release. For more details on Chameleon, check out the original repo: https://github.com/facebookresearch/chameleon",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output_dir",
|
"--output_dir",
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ class CohereConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ class Cohere2Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ class Cohere2Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class CsmDepthDecoderConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
@@ -235,7 +235,7 @@ class CsmConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf).
|
paper](https://arxiv.org/pdf/2305.13245.pdf).
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the backbone model Transformer decoder.
|
The non-linear activation function (function or string) in the backbone model Transformer decoder.
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ class DeepseekV3Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
n_shared_experts (`int`, *optional*, defaults to 1):
|
n_shared_experts (`int`, *optional*, defaults to 1):
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ class DiffLlamaConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -138,7 +138,7 @@ class Emu3TextConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ class FalconH1Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ class GemmaConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
head_dim (`int`, *optional*, defaults to 256):
|
head_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -151,7 +151,7 @@ def main():
|
|||||||
"--model_size",
|
"--model_size",
|
||||||
default="7B",
|
default="7B",
|
||||||
choices=["2B", "7B", "tokenizer_only"],
|
choices=["2B", "7B", "tokenizer_only"],
|
||||||
help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b",
|
help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, check out the original repo: https://huggingface.co/google/gemma-7b",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output_dir",
|
"--output_dir",
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ class GemmaConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
head_dim (`int`, *optional*, defaults to 256):
|
head_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ class Gemma2Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
head_dim (`int`, *optional*, defaults to 256):
|
head_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -184,7 +184,7 @@ def main():
|
|||||||
"--model_size",
|
"--model_size",
|
||||||
default="9B",
|
default="9B",
|
||||||
choices=["9B", "27B", "tokenizer_only"],
|
choices=["9B", "27B", "tokenizer_only"],
|
||||||
help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b",
|
help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, check out the original repo: https://huggingface.co/google/gemma-7b",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output_dir",
|
"--output_dir",
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ class Gemma2Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
head_dim (`int`, *optional*, defaults to 256):
|
head_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ class Gemma3TextConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
head_dim (`int`, *optional*, defaults to 256):
|
head_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ class Gemma3TextConfig(Gemma2Config, PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
head_dim (`int`, *optional*, defaults to 256):
|
head_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ class GlmConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
|
partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ class Glm4Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
|
partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position.
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class GraniteConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class GraniteMoeConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class GraniteMoeHybridConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class GraniteMoeSharedConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ class HeliumConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
head_dim (`int`, *optional*, defaults to 128):
|
head_dim (`int`, *optional*, defaults to 128):
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ class JambaConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class LlamaConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -528,7 +528,7 @@ def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--model_size",
|
"--model_size",
|
||||||
default=None,
|
default=None,
|
||||||
help="'f' Deprecated in favor of `num_shards`: models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama",
|
help="'f' Deprecated in favor of `num_shards`: models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, check out the original repo: https://huggingface.co/meta-llama",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output_dir",
|
"--output_dir",
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ class MimiConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||||
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
||||||
The attention head dimension.
|
The attention head dimension.
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class MiniMaxConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||||
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
||||||
The attention head dimension.
|
The attention head dimension.
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ class MiniMaxConfig(MixtralConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||||
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
||||||
The attention head dimension.
|
The attention head dimension.
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class MistralConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||||
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
||||||
The attention head dimension.
|
The attention head dimension.
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class MixtralConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||||
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
||||||
The attention head dimension.
|
The attention head dimension.
|
||||||
|
|||||||
@@ -227,7 +227,7 @@ def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--model_size",
|
"--model_size",
|
||||||
choices=["7B"],
|
choices=["7B"],
|
||||||
help="'f' models correspond to the finetuned versions, and are specific to the Mixtral official release. For more details on Mixtral, checkout the original repo: https://huggingface.co/mistral-ai",
|
help="'f' models correspond to the finetuned versions, and are specific to the Mixtral official release. For more details on Mixtral, check out the original repo: https://huggingface.co/mistral-ai",
|
||||||
default="7B",
|
default="7B",
|
||||||
)
|
)
|
||||||
parser.add_argument("--output_dir", help="Location to write HF model", required=True)
|
parser.add_argument("--output_dir", help="Location to write HF model", required=True)
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ class MoonshineConfig(PretrainedConfig):
|
|||||||
`encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
decoder_num_key_value_heads (`int`, *optional*):
|
decoder_num_key_value_heads (`int`, *optional*):
|
||||||
@@ -61,7 +61,7 @@ class MoonshineConfig(PretrainedConfig):
|
|||||||
`decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`decoder_num_attention_heads`.
|
`decoder_num_attention_heads`.
|
||||||
pad_head_dim_to_multiple_of (`int`, *optional*):
|
pad_head_dim_to_multiple_of (`int`, *optional*):
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ class MoonshineConfig(PretrainedConfig):
|
|||||||
`encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
decoder_num_key_value_heads (`int`, *optional*):
|
decoder_num_key_value_heads (`int`, *optional*):
|
||||||
@@ -83,7 +83,7 @@ class MoonshineConfig(PretrainedConfig):
|
|||||||
`decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`decoder_num_attention_heads`.
|
`decoder_num_attention_heads`.
|
||||||
pad_head_dim_to_multiple_of (`int`, *optional*):
|
pad_head_dim_to_multiple_of (`int`, *optional*):
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ class MoshiDepthConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`.
|
||||||
audio_vocab_size (`int`, *optional*, defaults to 2048):
|
audio_vocab_size (`int`, *optional*, defaults to 2048):
|
||||||
Vocabulary size of the audio part of model. Defines the number of different tokens that can be
|
Vocabulary size of the audio part of model. Defines the number of different tokens that can be
|
||||||
@@ -171,7 +171,7 @@ class MoshiConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`.
|
||||||
audio_vocab_size (`int`, *optional*):
|
audio_vocab_size (`int`, *optional*):
|
||||||
Vocabulary size of the audio part of model. Defines the number of different tokens that can be
|
Vocabulary size of the audio part of model. Defines the number of different tokens that can be
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ class NemotronConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ class OlmoConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class Olmo2Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class Olmo2Config(OlmoConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ class OlmoeConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ class PhiConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class Phi3Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
||||||
|
|||||||
@@ -268,7 +268,7 @@ class Phi4MultimodalConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
||||||
|
|||||||
@@ -304,7 +304,7 @@ class Phi4MultimodalConfig(Phi3Config):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
resid_pdrop (`float`, *optional*, defaults to 0.0):
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ class PhimoeConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ class Qwen2Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -238,7 +238,7 @@ class Qwen2_5OmniTextConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
@@ -584,7 +584,7 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -277,7 +277,7 @@ class Qwen2_5OmniTextConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
@@ -623,7 +623,7 @@ class Qwen2_5OmniTalkerConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -94,7 +94,7 @@ class Qwen2_5_VLTextConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class Qwen2MoeConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ class Qwen2VLTextConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ class Qwen3Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||||
head_dim (`int`, *optional*, defaults to 128):
|
head_dim (`int`, *optional*, defaults to 128):
|
||||||
The attention head dimension.
|
The attention head dimension.
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class Qwen3MoeConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -167,7 +167,7 @@ def main():
|
|||||||
"--model_size",
|
"--model_size",
|
||||||
default="2B",
|
default="2B",
|
||||||
choices=["2B", "7B", "tokenizer_only"],
|
choices=["2B", "7B", "tokenizer_only"],
|
||||||
help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b",
|
help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, check out the original repo: https://huggingface.co/google/gemma-7b",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output_dir",
|
"--output_dir",
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ class StableLmConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
||||||
`num_attention_heads`.
|
`num_attention_heads`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ class Starcoder2Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
|
||||||
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
|
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
|
||||||
The non-linear activation function (function or string) in the decoder.
|
The non-linear activation function (function or string) in the decoder.
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ class ZambaConfig(PretrainedConfig):
|
|||||||
`num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf).
|
paper](https://arxiv.org/pdf/2305.13245.pdf).
|
||||||
n_mamba_heads (`int`, *optional*, defaults to 2):
|
n_mamba_heads (`int`, *optional*, defaults to 2):
|
||||||
Number of mamba heads for each mamba layer.
|
Number of mamba heads for each mamba layer.
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ class Zamba2Config(PretrainedConfig):
|
|||||||
`num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if
|
`num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if
|
||||||
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
||||||
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
||||||
by meanpooling all the original heads within that group. For more details checkout [this
|
by meanpooling all the original heads within that group. For more details, check out [this
|
||||||
paper](https://arxiv.org/pdf/2305.13245.pdf).
|
paper](https://arxiv.org/pdf/2305.13245.pdf).
|
||||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ class BitNetHfQuantizer(HfQuantizer):
|
|||||||
1.58-bit quantization from BitNet quantization method:
|
1.58-bit quantization from BitNet quantization method:
|
||||||
Before loading: it converts the linear layers into BitLinear layers during loading.
|
Before loading: it converts the linear layers into BitLinear layers during loading.
|
||||||
|
|
||||||
Checkout the paper introducing this method : https://arxiv.org/pdf/2402.17764
|
Check out the paper introducing this method : https://arxiv.org/pdf/2402.17764
|
||||||
"""
|
"""
|
||||||
|
|
||||||
requires_parameters_quantization = False
|
requires_parameters_quantization = False
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOM
|
|||||||
default_cache_path = constants.default_cache_path
|
default_cache_path = constants.default_cache_path
|
||||||
|
|
||||||
# Determine default cache directory. Lots of legacy environment variables to ensure backward compatibility.
|
# Determine default cache directory. Lots of legacy environment variables to ensure backward compatibility.
|
||||||
# The best way to set the cache path is with the environment variable HF_HOME. For more details, checkout this
|
# The best way to set the cache path is with the environment variable HF_HOME. For more details, check out this
|
||||||
# documentation page: https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables.
|
# documentation page: https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables.
|
||||||
#
|
#
|
||||||
# In code, use `HF_HUB_CACHE` as the default cache path. This variable is set by the library and is guaranteed
|
# In code, use `HF_HUB_CACHE` as the default cache path. This variable is set by the library and is guaranteed
|
||||||
@@ -542,7 +542,7 @@ def cached_files(
|
|||||||
elif _raise_exceptions_for_missing_entries:
|
elif _raise_exceptions_for_missing_entries:
|
||||||
raise OSError(
|
raise OSError(
|
||||||
f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load the files, and couldn't find them in the"
|
f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load the files, and couldn't find them in the"
|
||||||
f" cached files.\nCheckout your internet connection or see how to run the library in offline mode at"
|
f" cached files.\nCheck your internet connection or see how to run the library in offline mode at"
|
||||||
" 'https://huggingface.co/docs/transformers/installation#offline-mode'."
|
" 'https://huggingface.co/docs/transformers/installation#offline-mode'."
|
||||||
) from e
|
) from e
|
||||||
# snapshot_download will not raise EntryNotFoundError, but hf_hub_download can. If this is the case, it will be treated
|
# snapshot_download will not raise EntryNotFoundError, but hf_hub_download can. If this is the case, it will be treated
|
||||||
|
|||||||
@@ -1492,7 +1492,7 @@ Please note that you may need to restart your runtime after installation.
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
SENTENCEPIECE_IMPORT_ERROR = """
|
SENTENCEPIECE_IMPORT_ERROR = """
|
||||||
{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
|
{0} requires the SentencePiece library but it was not found in your environment. Check out the instructions on the
|
||||||
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
|
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
|
||||||
that match your environment. Please note that you may need to restart your runtime after installation.
|
that match your environment. Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
@@ -1500,7 +1500,7 @@ that match your environment. Please note that you may need to restart your runti
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
PROTOBUF_IMPORT_ERROR = """
|
PROTOBUF_IMPORT_ERROR = """
|
||||||
{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the
|
{0} requires the protobuf library but it was not found in your environment. Check out the instructions on the
|
||||||
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
|
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
|
||||||
that match your environment. Please note that you may need to restart your runtime after installation.
|
that match your environment. Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
@@ -1508,7 +1508,7 @@ that match your environment. Please note that you may need to restart your runti
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
FAISS_IMPORT_ERROR = """
|
FAISS_IMPORT_ERROR = """
|
||||||
{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
|
{0} requires the faiss library but it was not found in your environment. Check out the instructions on the
|
||||||
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
|
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
|
||||||
that match your environment. Please note that you may need to restart your runtime after installation.
|
that match your environment. Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
@@ -1516,7 +1516,7 @@ that match your environment. Please note that you may need to restart your runti
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
PYTORCH_IMPORT_ERROR = """
|
PYTORCH_IMPORT_ERROR = """
|
||||||
{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
|
{0} requires the PyTorch library but it was not found in your environment. Check out the instructions on the
|
||||||
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
|
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
|
||||||
Please note that you may need to restart your runtime after installation.
|
Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
@@ -1524,7 +1524,7 @@ Please note that you may need to restart your runtime after installation.
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
TORCHVISION_IMPORT_ERROR = """
|
TORCHVISION_IMPORT_ERROR = """
|
||||||
{0} requires the Torchvision library but it was not found in your environment. Checkout the instructions on the
|
{0} requires the Torchvision library but it was not found in your environment. Check out the instructions on the
|
||||||
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
|
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
|
||||||
Please note that you may need to restart your runtime after installation.
|
Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
@@ -1576,7 +1576,7 @@ Please note that you may need to restart your runtime after installation.
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
TENSORFLOW_IMPORT_ERROR = """
|
TENSORFLOW_IMPORT_ERROR = """
|
||||||
{0} requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the
|
{0} requires the TensorFlow library but it was not found in your environment. Check out the instructions on the
|
||||||
installation page: https://www.tensorflow.org/install and follow the ones that match your environment.
|
installation page: https://www.tensorflow.org/install and follow the ones that match your environment.
|
||||||
Please note that you may need to restart your runtime after installation.
|
Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
@@ -1584,7 +1584,7 @@ Please note that you may need to restart your runtime after installation.
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
DETECTRON2_IMPORT_ERROR = """
|
DETECTRON2_IMPORT_ERROR = """
|
||||||
{0} requires the detectron2 library but it was not found in your environment. Checkout the instructions on the
|
{0} requires the detectron2 library but it was not found in your environment. Check out the instructions on the
|
||||||
installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones
|
installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones
|
||||||
that match your environment. Please note that you may need to restart your runtime after installation.
|
that match your environment. Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
@@ -1592,14 +1592,14 @@ that match your environment. Please note that you may need to restart your runti
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
FLAX_IMPORT_ERROR = """
|
FLAX_IMPORT_ERROR = """
|
||||||
{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
|
{0} requires the FLAX library but it was not found in your environment. Check out the instructions on the
|
||||||
installation page: https://github.com/google/flax and follow the ones that match your environment.
|
installation page: https://github.com/google/flax and follow the ones that match your environment.
|
||||||
Please note that you may need to restart your runtime after installation.
|
Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
FTFY_IMPORT_ERROR = """
|
FTFY_IMPORT_ERROR = """
|
||||||
{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the
|
{0} requires the ftfy library but it was not found in your environment. Check out the instructions on the
|
||||||
installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones
|
installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones
|
||||||
that match your environment. Please note that you may need to restart your runtime after installation.
|
that match your environment. Please note that you may need to restart your runtime after installation.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -19,5 +19,5 @@ limitations under the License.
|
|||||||
This page has been updated in light of the removal of the `add_new_model` script in favor of the more complete
|
This page has been updated in light of the removal of the `add_new_model` script in favor of the more complete
|
||||||
`add_new_model_like` script.
|
`add_new_model_like` script.
|
||||||
|
|
||||||
We recommend you checkout the documentation of [How to add a model](https://huggingface.co/docs/transformers/main/en/add_new_model)
|
We recommend you check out the documentation on [how to add a model](https://huggingface.co/docs/transformers/main/en/add_new_model)
|
||||||
in the Hugging Face Transformers documentation for complete and up-to-date instructions.
|
for complete and up-to-date instructions.
|
||||||
|
|||||||
Reference in New Issue
Block a user