Run model as compressed/uncompressed mode (#34719)

* draft, run model as compressed/uncompressed mode * draft * run run_compressed=False * run_compressed as attr * set run_compressed=False using quantization_config * remove redundant line * make is_qat_trainable dependent on run_compressed status * add tests * lint * full in docstring * add decompress * comments * decompress if model is compressed and not run_compressed * apply_quant_config logic fix -- populate statedict properly * comments * remove non compressed model * make is_compressed as property * cosmetic * run apply_quant_config for non-compressed models -- populate scales and zeropoints * add pathway for decompressing sparse models * typo on is_quantization_compressed * lint * fix typo
2024-12-13 02:23:31 -05:00
parent 31f9a289a6
commit e4e404fdd0
9 changed files with 250 additions and 18 deletions
--- a/tests/quantization/compressed_tensor/test_load_sparse_model.py
+++ b/tests/quantization/compressed_tensor/test_load_sparse_model.py
@@ -0,0 +1,80 @@
+import gc
+import unittest
+
+from transformers import AutoModelForCausalLM
+from transformers.testing_utils import require_compressed_tensors, require_torch
+from transformers.utils import is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+
+@require_compressed_tensors
+@require_torch
+class CompressedTensorsTest(unittest.TestCase):  # loads the two sparse checkpoints below and compares them
+    model_sparse_uncompressed = "horheynm/llama2.c_stories15M_pruned_50.2of4_uncompressed"
+    model_sparse_compressed = "horheynm/llama2.c_stories15M_pruned_50.2of4_compressed"
+
+    prompt = "Paris is the capital of which country?"
+
+    stubs = [model_sparse_uncompressed, model_sparse_compressed]  # iterated by test_run_compressed_outputs_match
+
+    def tearDown(self):  # after each test: collect garbage and empty the CUDA cache
+        gc.collect()
+        torch.cuda.empty_cache()
+        gc.collect()
+
+    def test_compressed_uncompressed_model_shapes(self):
+        """
+        Check that the weights are the same between
+         uncompressed and compressed-decompressed model
+        Sparse compressed modules' weights are "packed" and shape/value will
+         differ
+        """
+
+        def _has_nested_attr(obj, attr_path):  # despite the name, returns the resolved object (or None), not a bool
+            attrs = attr_path.split(".")
+            for attr in attrs:
+                if not hasattr(obj, attr):
+                    return None  # a hop of the dotted path is missing
+                obj = getattr(obj, attr)
+            return obj
+
+        from compressed_tensors.quantization.utils import iter_named_leaf_modules
+
+        uncompressed_model = AutoModelForCausalLM.from_pretrained(
+            self.model_sparse_uncompressed,
+        )
+
+        compressed_model_decompressed = AutoModelForCausalLM.from_pretrained(
+            self.model_sparse_compressed,
+        )
+
+        for name, submodule in iter_named_leaf_modules(
+            uncompressed_model,
+        ):
+            # NOTE(review): a module that resolves to None (or any falsy object) is skipped silently, not failed
+            if comp_decomp_obj := _has_nested_attr(compressed_model_decompressed, name):
+                if hasattr(submodule, "weight"):  # only the uncompressed side is checked for a weight attribute
+                    assert torch.equal(submodule.weight, comp_decomp_obj.weight)  # exact equality, no tolerance
+
+    def test_run_compressed_outputs_match(self):
+        """Check that uncompressed and compressed-decompressed model outputs are the same"""
+
+        from transformers import AutoTokenizer
+
+        for stub in self.stubs:  # NOTE(review): stub only selects the tokenizer; both models below use fixed attributes
+            tokenizer = AutoTokenizer.from_pretrained(stub)
+            input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids
+
+            uncompressed_model = AutoModelForCausalLM.from_pretrained(
+                self.model_sparse_uncompressed,
+            )
+            # NOTE(review): "rc_true" holds the uncompressed checkpoint's output; no run_compressed flag is set here
+            output_rc_true = uncompressed_model.generate(input_ids, max_new_tokens=100)
+
+            compressed_model_decompressed = AutoModelForCausalLM.from_pretrained(
+                self.model_sparse_compressed,
+            )
+            # NOTE(review): "rc_false" holds the compressed checkpoint's output
+            output_rc_false = compressed_model_decompressed.generate(input_ids, max_new_tokens=100)
+
+            assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0])  # first sequence only
--- a/tests/quantization/compressed_tensor/test_run_compressed_model.py
+++ b/tests/quantization/compressed_tensor/test_run_compressed_model.py
@@ -0,0 +1,94 @@
+import gc
+import unittest
+
+from transformers import AutoModelForCausalLM
+from transformers.testing_utils import require_compressed_tensors, require_torch
+from transformers.utils import is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+
+@require_compressed_tensors
+@require_torch
+class CompressedTensorsTest(unittest.TestCase):  # loads the two checkpoints below with and without run_compressed=False
+    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
+    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
+
+    prompt = "Paris is the capital of which country?"
+
+    stubs = [tinyllama_w4a16, tinyllama_w8a8]  # every test below loops over both checkpoints
+
+    def tearDown(self):  # after each test: collect garbage and empty the CUDA cache
+        gc.collect()
+        torch.cuda.empty_cache()
+        gc.collect()
+
+    def test_default_run_compressed__True(self):  # no quantization_config passed: expect CompressedLinear modules
+        from compressed_tensors.linear.compressed_linear import CompressedLinear
+        from compressed_tensors.quantization.utils import iter_named_leaf_modules
+
+        for stub in self.stubs:
+            model = AutoModelForCausalLM.from_pretrained(
+                stub,
+            )
+            compressed_linear_counts = 0
+
+            for _, submodule in iter_named_leaf_modules(
+                model,
+            ):
+                if isinstance(submodule, CompressedLinear):
+                    compressed_linear_counts += 1
+
+            # some linear modules are not compressed (e.g. lm_head), so only require at least one
+            assert compressed_linear_counts > 0
+
+    def test_default_run_compressed__False(self):  # run_compressed=False: expect no CompressedLinear modules
+        from compressed_tensors.linear.compressed_linear import CompressedLinear
+        from compressed_tensors.quantization.utils import iter_named_leaf_modules
+
+        from transformers.utils.quantization_config import CompressedTensorsConfig
+
+        quantization_config = CompressedTensorsConfig(run_compressed=False)  # one instance reused for every stub
+
+        for stub in self.stubs:
+            model = AutoModelForCausalLM.from_pretrained(
+                stub,
+                quantization_config=quantization_config,
+            )
+            compressed_linear_counts = 0
+
+            for _, submodule in iter_named_leaf_modules(
+                model,
+            ):
+                if isinstance(submodule, CompressedLinear):
+                    compressed_linear_counts += 1
+
+            # No modules should be CompressedLinear
+            assert compressed_linear_counts == 0
+
+    def test_run_compressed_outputs_match(self):
+        """Check that run_compressed=True/False output are the same"""
+
+        from transformers import AutoTokenizer
+        from transformers.utils.quantization_config import CompressedTensorsConfig
+
+        quantization_config = CompressedTensorsConfig(run_compressed=False)  # one instance reused for every stub
+
+        for stub in self.stubs:
+            tokenizer = AutoTokenizer.from_pretrained(stub)
+            input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids
+
+            model_run_compressed__True = AutoModelForCausalLM.from_pretrained(  # loaded without a quantization_config
+                stub,
+            )
+            output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)
+
+            model_run_compressed__False = AutoModelForCausalLM.from_pretrained(  # same stub, run_compressed=False
+                stub,
+                quantization_config=quantization_config,
+            )
+            output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)
+
+            assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0])  # first sequence only