diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index bd2354aec7..e3b2209f02 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -4673,6 +4673,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] if hf_quantizer is not None: missing_keys = hf_quantizer.update_missing_keys(model, missing_keys, prefix) + unexpected_keys = hf_quantizer.update_unexpected_keys(model, unexpected_keys, prefix) # retrieve weights on meta device and put them back on CPU. # This is not ideal in terms of memory, but if we don't do that not, we can't initialize them in the next step @@ -4993,6 +4994,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix load_offloaded_weights(model_to_load, state_dict_index, state_dict_folder) shutil.rmtree(state_dict_folder) + if hf_quantizer is not None: + missing_keys = hf_quantizer.update_missing_keys_after_loading(model_to_load, missing_keys, prefix) + if len(error_msgs) > 0: error_msg = "\n\t".join(error_msgs) if "size mismatch" in error_msg: diff --git a/src/transformers/quantizers/base.py b/src/transformers/quantizers/base.py index d6303b2302..a1096c110d 100755 --- a/src/transformers/quantizers/base.py +++ b/src/transformers/quantizers/base.py @@ -109,6 +109,27 @@ class HfQuantizer(ABC): """ return missing_keys + def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]: + """ + Override this method if you want to adjust the `unexpected_keys`. 
+ + Args: + unexpected_keys (`List[str]`, *optional*): + The list of unexpected keys in the checkpoint compared to the state dict of the model + """ + return unexpected_keys + + def update_missing_keys_after_loading(self, model, missing_keys: List[str], prefix: str) -> List[str]: + """ + Override this method if you want to adjust the `missing_keys` after loading the model params, + but before the model is post-processed. + + Args: + missing_keys (`List[str]`, *optional*): + The list of missing keys in the checkpoint compared to the state dict of the model + """ + return missing_keys + def update_expected_keys(self, model, expected_keys: List[str], loaded_keys: List[str]) -> List[str]: """ Override this method if you want to adjust the `update_expected_keys`. diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 7d208087bb..3e65b103d5 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -14,6 +14,8 @@ import os +import re +from typing import List from ..utils import is_compressed_tensors_available, is_torch_available, logging from ..utils.quantization_config import CompressedTensorsConfig @@ -50,6 +52,45 @@ class CompressedTensorsHfQuantizer(HfQuantizer): self.run_compressed = quantization_config.run_compressed self.quantization_config = quantization_config + def update_missing_keys_after_loading(self, model, missing_keys: List[str], prefix: str) -> List[str]: + """ + Update missing keys after loading the model. This is necessary for compressed tensors + to load the model correctly. We expect weights to be present in missing keys. 
+ The weights are reconstructed by ModelCompressor in _process_model_after_weight_loading + + This function cleans up expected missing keys and returns the remaining missing keys + """ + + if self.run_compressed: + return missing_keys + + # We expect some keys to be missing for + # compressed models + # This is fine as the weights are reconstructed by ModelCompressor + # in _process_model_after_weight_loading + + expected_missing_keys = self.compressor.get_missing_module_keys(model) + return [ + key for key in missing_keys if not any(re.match(f".*{pattern}", key) for pattern in expected_missing_keys) + ] + + def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]: + """ + Remove from `unexpected_keys` the keys matching the patterns from `compressor.get_unexpected_file_keys`. + + Args: + unexpected_keys (`List[str]`, *optional*): + The list of unexpected keys in the checkpoint compared to the state dict of the model + """ + + if self.run_compressed: + return unexpected_keys + + # We expect some unexpected keys in model + # safetensors file for compressed models + keys_to_ignore = self.compressor.get_unexpected_file_keys(model) + return [key for key in unexpected_keys if not any(re.match(f".*{pattern}", key) for pattern in keys_to_ignore)] + def validate_environment(self, *args, **kwargs): if not is_compressed_tensors_available(): raise ImportError( @@ -75,9 +116,11 @@ class CompressedTensorsHfQuantizer(HfQuantizer): ct_quantization_config = self.compressor.quantization_config - if self.run_compressed and self.is_quantization_compressed: + if self.run_compressed: + if not self.is_quantization_compressed: + raise ValueError("`run_compressed` is only supported for quantized_compressed models") apply_quantization_config(model, ct_quantization_config, run_compressed=True) - elif not self.is_quantization_compressed: + elif self.is_quantized and not self.is_quantization_compressed: apply_quantization_config(model, ct_quantization_config) def 
_process_model_after_weight_loading(self, model, **kwargs): @@ -99,6 +142,12 @@ class CompressedTensorsHfQuantizer(HfQuantizer): self.compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN self.compressor.decompress(model_path=cache_path, model=model) + @property + def is_quantized(self): + return self.quantization_config.quantization_config is not None and bool( + self.quantization_config.quantization_config.config_groups + ) + @property def is_quantization_compressed(self): from compressed_tensors.quantization import QuantizationStatus diff --git a/tests/quantization/compressed_tensor/test_load_sparse_model.py b/tests/quantization/compressed_tensor/test_load_sparse_model.py deleted file mode 100644 index 8992cd3d9b..0000000000 --- a/tests/quantization/compressed_tensor/test_load_sparse_model.py +++ /dev/null @@ -1,80 +0,0 @@ -import gc -import unittest - -from transformers import AutoModelForCausalLM -from transformers.testing_utils import require_compressed_tensors, require_torch -from transformers.utils import is_torch_available - - -if is_torch_available(): - import torch - - -@require_compressed_tensors -@require_torch -class CompressedTensorsTest(unittest.TestCase): - model_sparse_uncompressed = "horheynm/llama2.c_stories15M_pruned_50.2of4_uncompressed" - model_sparse_compressed = "horheynm/llama2.c_stories15M_pruned_50.2of4_compressed" - - prompt = "Paris is the capital of which country?" 
- - stubs = [model_sparse_uncompressed, model_sparse_compressed] - - def tearDown(self): - gc.collect() - torch.cuda.empty_cache() - gc.collect() - - def test_compressed_uncompressed_model_shapes(self): - """ - Check that the weights are the same between - uncompressed and compressed-decompressed model - Sparse compressed modules' weights are "packed" and shape/value will - differ - """ - - def _has_nested_attr(obj, attr_path): - attrs = attr_path.split(".") - for attr in attrs: - if not hasattr(obj, attr): - return None - obj = getattr(obj, attr) - return obj - - from compressed_tensors.quantization.utils import iter_named_leaf_modules - - uncompressed_model = AutoModelForCausalLM.from_pretrained( - self.model_sparse_uncompressed, - ) - - compressed_model_decompressed = AutoModelForCausalLM.from_pretrained( - self.model_sparse_compressed, - ) - - for name, submodule in iter_named_leaf_modules( - uncompressed_model, - ): - if comp_decomp_obj := _has_nested_attr(compressed_model_decompressed, name): - if hasattr(submodule, "weight"): - assert torch.equal(submodule.weight, comp_decomp_obj.weight) - - def test_run_compressed_outputs_match(self): - """Check that uncompressed and compressed-decompressed model outputs are the same""" - - from transformers import AutoTokenizer - - for stub in self.stubs: - tokenizer = AutoTokenizer.from_pretrained(stub) - input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids - - uncompressed_model = AutoModelForCausalLM.from_pretrained( - self.model_sparse_uncompressed, - ) - output_rc_true = uncompressed_model.generate(input_ids, max_new_tokens=100) - - compressed_model_decompressed = AutoModelForCausalLM.from_pretrained( - self.model_sparse_compressed, - ) - output_rc_false = compressed_model_decompressed.generate(input_ids, max_new_tokens=100) - - assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0]) diff --git a/tests/quantization/compressed_tensor/test_run_compressed_model.py 
b/tests/quantization/compressed_tensor/test_run_compressed_model.py deleted file mode 100644 index b168ca382c..0000000000 --- a/tests/quantization/compressed_tensor/test_run_compressed_model.py +++ /dev/null @@ -1,94 +0,0 @@ -import gc -import unittest - -from transformers import AutoModelForCausalLM -from transformers.testing_utils import require_compressed_tensors, require_torch -from transformers.utils import is_torch_available - - -if is_torch_available(): - import torch - - -@require_compressed_tensors -@require_torch -class CompressedTensorsTest(unittest.TestCase): - tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer" - tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer" - - prompt = "Paris is the capital of which country?" - - stubs = [tinyllama_w4a16, tinyllama_w8a8] - - def tearDown(self): - gc.collect() - torch.cuda.empty_cache() - gc.collect() - - def test_default_run_compressed__True(self): - from compressed_tensors.linear.compressed_linear import CompressedLinear - from compressed_tensors.quantization.utils import iter_named_leaf_modules - - for stub in self.stubs: - model = AutoModelForCausalLM.from_pretrained( - stub, - ) - compressed_linear_counts = 0 - - for _, submodule in iter_named_leaf_modules( - model, - ): - if isinstance(submodule, CompressedLinear): - compressed_linear_counts += 1 - - # some linear models are not compressed - ex. 
lm_head - assert compressed_linear_counts > 0 - - def test_default_run_compressed__False(self): - from compressed_tensors.linear.compressed_linear import CompressedLinear - from compressed_tensors.quantization.utils import iter_named_leaf_modules - - from transformers.utils.quantization_config import CompressedTensorsConfig - - quantization_config = CompressedTensorsConfig(run_compressed=False) - - for stub in self.stubs: - model = AutoModelForCausalLM.from_pretrained( - stub, - quantization_config=quantization_config, - ) - compressed_linear_counts = 0 - - for _, submodule in iter_named_leaf_modules( - model, - ): - if isinstance(submodule, CompressedLinear): - compressed_linear_counts += 1 - - # No modules should be CompressedLinear - assert compressed_linear_counts == 0 - - def test_run_compressed_outputs_match(self): - """Check that run_compressed=True/False output are the same""" - - from transformers import AutoTokenizer - from transformers.utils.quantization_config import CompressedTensorsConfig - - quantization_config = CompressedTensorsConfig(run_compressed=False) - - for stub in self.stubs: - tokenizer = AutoTokenizer.from_pretrained(stub) - input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids - - model_run_compressed__True = AutoModelForCausalLM.from_pretrained( - stub, - ) - output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100) - - model_run_compressed__False = AutoModelForCausalLM.from_pretrained( - stub, - quantization_config=quantization_config, - ) - output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100) - - assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0]) diff --git a/tests/quantization/compressed_tensor/__init__.py b/tests/quantization/compressed_tensors/__init__.py similarity index 100% rename from tests/quantization/compressed_tensor/__init__.py rename to tests/quantization/compressed_tensors/__init__.py diff --git 
a/tests/quantization/compressed_tensors/test_compressed_models.py b/tests/quantization/compressed_tensors/test_compressed_models.py new file mode 100644 index 0000000000..bc64f77ce9 --- /dev/null +++ b/tests/quantization/compressed_tensors/test_compressed_models.py @@ -0,0 +1,231 @@ +import gc +import unittest +import warnings + +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.testing_utils import require_compressed_tensors, require_torch +from transformers.utils import is_torch_available +from transformers.utils.quantization_config import CompressedTensorsConfig + + +if is_torch_available(): + import torch + + +@require_compressed_tensors +@require_torch +class StackCompressedModelTest(unittest.TestCase): + # Define stubs as class attributes + compressed_uncompressed_model_stubs = [ + ( + "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed", + "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed", + ), + ( + "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed", + "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed", + ), + ( + "nm-testing/llama2.c-stories42M-gsm8k-stacked-compressed", + "nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed", + ), + ] + # Flatten the list for tests that require a single list of stubs. + model_stubs = [stub for pair in compressed_uncompressed_model_stubs for stub in pair] + + # For the outputs matching test, use the sparse-only pair. + sparse_compressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed" + sparse_uncompressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed" + + prompt = "Paris is the capital of which country?" + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + gc.collect() + + def test_compressed_uncompressed_model_shapes(self): + """ + Verify that the weights of an uncompressed model and its decompressed compressed counterpart match. 
+ Note: sparse-only weights must match exactly; quantized-only and stacked weights are compared with `atol=0.2`. + """ + + def _has_nested_attr(obj, attr_path): + attrs = attr_path.split(".") + for attr in attrs: + if not hasattr(obj, attr): + return None + obj = getattr(obj, attr) + return obj + + from compressed_tensors.quantization.utils import iter_named_leaf_modules + + for compressed_model, uncompressed_model in self.compressed_uncompressed_model_stubs: + with self.subTest(compressed_model=compressed_model, uncompressed_model=uncompressed_model): + uncompressed = AutoModelForCausalLM.from_pretrained( + uncompressed_model, + device_map="auto", + torch_dtype="auto", + quantization_config=CompressedTensorsConfig(run_compressed=False), + ) + compressed_decompressed = AutoModelForCausalLM.from_pretrained( + compressed_model, + device_map="auto", + torch_dtype="auto", + quantization_config=CompressedTensorsConfig(run_compressed=False), + ) + + for name, submodule in iter_named_leaf_modules(uncompressed): + comp_decomp_obj = _has_nested_attr(compressed_decompressed, name) + if comp_decomp_obj is not None and hasattr(submodule, "weight"): + if "sparse-only" in uncompressed_model: + self.assertTrue( + torch.equal(submodule.weight, comp_decomp_obj.weight), + f"Weight mismatch for module '{name}' in sparse-only model.", + ) + else: + self.assertTrue( + torch.allclose(submodule.weight, comp_decomp_obj.weight, atol=0.2), + f"Weight mismatch for module '{name}' in quantized-only or stacked model.", + ) + + def test_outputs_match(self): + """ + Ensure that the generated outputs match between the uncompressed model + and its decompressed compressed counterpart. 
+ """ + tokenizer = AutoTokenizer.from_pretrained(self.sparse_uncompressed_model) + input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids + + uncompressed = AutoModelForCausalLM.from_pretrained( + self.sparse_uncompressed_model, + device_map="auto", + torch_dtype="auto", + quantization_config=CompressedTensorsConfig(run_compressed=False), + ) + + output_uncompressed = uncompressed.generate(input_ids.to(uncompressed.device), max_new_tokens=100) + + decompressed = AutoModelForCausalLM.from_pretrained( + self.sparse_compressed_model, + device_map="auto", + torch_dtype="auto", + quantization_config=CompressedTensorsConfig(run_compressed=False), + ) + output_decompressed = decompressed.generate(input_ids.to(decompressed.device), max_new_tokens=100) + + self.assertEqual( + tokenizer.decode(output_uncompressed[0]), + tokenizer.decode(output_decompressed[0]), + "Generated outputs do not match between compressed and uncompressed models.", + ) + + def test_no_warnings_for_all_models(self): + """ + Confirm that loading any model using compressed tensors does not trigger + warnings about missing or unexpected keys. 
+ """ + for model_stub in self.model_stubs: + with self.subTest(model_stub=model_stub): + with warnings.catch_warnings(record=True) as caught_warnings: + warnings.simplefilter("always") + AutoModelForCausalLM.from_pretrained( + model_stub, + device_map="auto", + torch_dtype="auto", + quantization_config=CompressedTensorsConfig(run_compressed=False), + ) + for warning in caught_warnings: + self.assertNotIn( + "missing keys", + str(warning.message).lower(), + f"'missing keys' found in warnings for model {model_stub}", + ) + self.assertNotIn( + "unexpected keys", + str(warning.message).lower(), + f"'unexpected keys' found in warnings for model {model_stub}", + ) + + +@require_compressed_tensors +@require_torch +class RunCompressedTest(unittest.TestCase): + tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer" + tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer" + + prompt = "Paris is the capital of which country?" + + stubs = [tinyllama_w4a16, tinyllama_w8a8] + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + gc.collect() + + def test_default_run_compressed__True(self): + from compressed_tensors.linear.compressed_linear import CompressedLinear + from compressed_tensors.quantization.utils import iter_named_leaf_modules + + for stub in self.stubs: + model = AutoModelForCausalLM.from_pretrained( + stub, + ) + compressed_linear_counts = 0 + + for _, submodule in iter_named_leaf_modules( + model, + ): + if isinstance(submodule, CompressedLinear): + compressed_linear_counts += 1 + + # some linear models are not compressed - ex. 
lm_head + assert compressed_linear_counts > 0 + + def test_default_run_compressed__False(self): + from compressed_tensors.linear.compressed_linear import CompressedLinear + from compressed_tensors.quantization.utils import iter_named_leaf_modules + from transformers.utils.quantization_config import CompressedTensorsConfig + + quantization_config = CompressedTensorsConfig(run_compressed=False) + + for stub in self.stubs: + model = AutoModelForCausalLM.from_pretrained( + stub, + quantization_config=quantization_config, + ) + compressed_linear_counts = 0 + + for _, submodule in iter_named_leaf_modules( + model, + ): + if isinstance(submodule, CompressedLinear): + compressed_linear_counts += 1 + + # No modules should be CompressedLinear + assert compressed_linear_counts == 0 + + def test_run_compressed_outputs_match(self): + """Check that run_compressed=True/False output are the same""" + + from transformers import AutoTokenizer + from transformers.utils.quantization_config import CompressedTensorsConfig + + quantization_config = CompressedTensorsConfig(run_compressed=False) + + for stub in self.stubs: + tokenizer = AutoTokenizer.from_pretrained(stub) + input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids + + model_run_compressed__True = AutoModelForCausalLM.from_pretrained( + stub, + ) + output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100) + + model_run_compressed__False = AutoModelForCausalLM.from_pretrained( + stub, + quantization_config=quantization_config, + ) + output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100) + + assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0]) diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensors/test_compressed_tensors.py similarity index 100% rename from tests/quantization/compressed_tensor/test_compressed_tensors.py rename to 
tests/quantization/compressed_tensors/test_compressed_tensors.py