diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 17f86e1667..ecf34bbf5e 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -102,6 +102,20 @@ GGUF_CONFIG_MAPPING = { "attention.layer_norm_rms_epsilon": "rms_norm_eps", "vocab_size": "vocab_size", }, + "qwen3moe": { + "context_length": "max_position_embeddings", + "block_count": "num_hidden_layers", + "feed_forward_length": "intermediate_size", + "embedding_length": "hidden_size", + "rope.dimension_count": None, + "rope.freq_base": "rope_theta", + "attention.head_count": "num_attention_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_rms_epsilon": "rms_norm_eps", + "vocab_size": "vocab_size", + "expert_count": "num_experts", + "expert_used_count": "num_experts_per_tok", + }, "falcon": { "context_length": "max_position_embeddings", "block_count": "num_hidden_layers", @@ -689,6 +703,7 @@ GGUF_TO_FAST_CONVERTERS = { "qwen2": GGUFQwen2Converter, "qwen2_moe": GGUFQwen2Converter, "qwen3": GGUFQwen2Converter, + "qwen3_moe": GGUFQwen2Converter, "phi3": GGUFPhi3Converter, "bloom": GGUFGPTConverter, "falcon": GGUFGPTConverter, diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index d2ed7f7a74..408ab47a62 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -302,6 +302,7 @@ class GgufModelTests(unittest.TestCase): gemma3_text_model_id = "unsloth/gemma-3-1b-it-GGUF" gemma3_vision_model_id = "unsloth/gemma-3-4b-it-GGUF" qwen3_model_id = "Qwen/Qwen3-0.6B-GGUF" + qwen3moe_model_id = "Qwen/Qwen3-30B-A3B-GGUF" q4_0_phi3_model_id = "Phi-3-mini-4k-instruct-q4.gguf" q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf" @@ -335,6 +336,7 @@ class GgufModelTests(unittest.TestCase): bf16_gemma3_text_model_id = "gemma-3-1b-it-BF16.gguf" bf16_gemma3_vision_model_id = "gemma-3-4b-it-BF16.gguf" q8_0_qwen3_model_id = "Qwen3-0.6B-Q8_0.gguf" + q4_k_m_qwen3moe_model_id = "Qwen3-30B-A3B-Q4_K_M.gguf" example_text = "Hello" @@ -973,3 +975,17 @@ class GgufModelTests(unittest.TestCase): EXPECTED_TEXT = "HelloED\nI need to find the value of the" self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + def test_qwen3moe_q4_k_m(self): + tokenizer = AutoTokenizer.from_pretrained(self.qwen3moe_model_id, gguf_file=self.q4_k_m_qwen3moe_model_id) + model = AutoModelForCausalLM.from_pretrained( + self.qwen3moe_model_id, + gguf_file=self.q4_k_m_qwen3moe_model_id, + torch_dtype=torch.float16, + ) + + text = tokenizer(self.example_text, return_tensors="pt") + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello, I am a 20 year old male" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)