From 2b79f143756bf10a2d43e5ab6d413374b633d2ce Mon Sep 17 00:00:00 2001
From: 44670 <44670@users.noreply.github.com>
Date: Tue, 15 Jul 2025 17:53:41 +0800
Subject: [PATCH] support loading qwen3 gguf (#38645)

* support loading qwen3 gguf

* Add qwen3 into GGUF_TO_FAST_CONVERTERS for tokenizer conversion

* Add testcase

* Fix formatting
---
 src/transformers/integrations/ggml.py | 13 +++++++++++++
 tests/quantization/ggml/test_ggml.py  | 18 ++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index 51bdc88608..17f86e1667 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -90,6 +90,18 @@ GGUF_CONFIG_MAPPING = {
         "expert_count": "num_experts",
         "expert_used_count": "num_experts_per_tok",
     },
+    "qwen3": {
+        "context_length": "max_position_embeddings",
+        "block_count": "num_hidden_layers",
+        "feed_forward_length": "intermediate_size",
+        "embedding_length": "hidden_size",
+        "rope.dimension_count": None,
+        "rope.freq_base": "rope_theta",
+        "attention.head_count": "num_attention_heads",
+        "attention.head_count_kv": "num_key_value_heads",
+        "attention.layer_norm_rms_epsilon": "rms_norm_eps",
+        "vocab_size": "vocab_size",
+    },
     "falcon": {
         "context_length": "max_position_embeddings",
         "block_count": "num_hidden_layers",
@@ -676,6 +688,7 @@ GGUF_TO_FAST_CONVERTERS = {
     "llama": GGUFLlamaConverter,
     "qwen2": GGUFQwen2Converter,
     "qwen2_moe": GGUFQwen2Converter,
+    "qwen3": GGUFQwen2Converter,
     "phi3": GGUFPhi3Converter,
     "bloom": GGUFGPTConverter,
     "falcon": GGUFGPTConverter,
diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py
index 6a0321f065..d2ed7f7a74 100644
--- a/tests/quantization/ggml/test_ggml.py
+++ b/tests/quantization/ggml/test_ggml.py
@@ -301,6 +301,7 @@ class GgufModelTests(unittest.TestCase):
     gemma3_qat_model_id = "google/gemma-3-1b-it-qat-q4_0-gguf"
     gemma3_text_model_id = "unsloth/gemma-3-1b-it-GGUF"
     gemma3_vision_model_id = "unsloth/gemma-3-4b-it-GGUF"
+    qwen3_model_id = "Qwen/Qwen3-0.6B-GGUF"
 
     q4_0_phi3_model_id = "Phi-3-mini-4k-instruct-q4.gguf"
     q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf"
@@ -333,6 +334,7 @@ class GgufModelTests(unittest.TestCase):
     q4_0_gemma3_qat_model_id = "gemma-3-1b-it-q4_0.gguf"
     bf16_gemma3_text_model_id = "gemma-3-1b-it-BF16.gguf"
     bf16_gemma3_vision_model_id = "gemma-3-4b-it-BF16.gguf"
+    q8_0_qwen3_model_id = "Qwen3-0.6B-Q8_0.gguf"
 
     example_text = "Hello"
 
@@ -955,3 +957,19 @@ class GgufModelTests(unittest.TestCase):
                 torch.testing.assert_close(original_params, converted_state_dict[layer_name])
             else:
                 raise ValueError(f"Layer {layer_name} is not presented in GGUF model")
+
+    @require_read_token
+    @unittest.skipUnless(is_gguf_available("0.16.0"), "test requires gguf version >= 0.16.0")
+    def test_qwen3_q8_0(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.qwen3_model_id, gguf_file=self.q8_0_qwen3_model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.qwen3_model_id,
+            gguf_file=self.q8_0_qwen3_model_id,
+            torch_dtype=torch.float16,
+        )
+
+        text = tokenizer(self.example_text, return_tensors="pt")["input_ids"]
+        out = model.generate(text, max_new_tokens=10)
+
+        EXPECTED_TEXT = "HelloED\nI need to find the value of the"
+        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)