Run model as compressed/uncompressed mode (#34719)

* draft, run model as compressed/uncompressed mode

* draft

* run run_compressed=False

* run_compressed as attr

* set run_compressed=False using quantization_config

* remove redundant line

* make is_qat_trainable dependent on run_compressed status

* add tests

* lint

* full in docstring

* add decompress

* comments

* decompress if model is compressed and not run_compressed

* apply_quant_config logic fix -- populate statedict properly

* comments

* remove non  compressed model

* make is_compressed as property

* cosmetic

* run apply_quant_config for non-compressed models -- populate scales and zeropoints

* add pathway for decompressing sparse models

* typo on is_quantization_compressed

* lint

* fix typo
This commit is contained in:
George
2024-12-13 02:23:31 -05:00
committed by GitHub
parent 31f9a289a6
commit e4e404fdd0
9 changed files with 250 additions and 18 deletions

View File

@@ -0,0 +1,80 @@
import gc
import unittest
from transformers import AutoModelForCausalLM
from transformers.testing_utils import require_compressed_tensors, require_torch
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class CompressedTensorsTest(unittest.TestCase):
    """Compare a sparse checkpoint stored uncompressed with its compressed variant.

    Both checkpoints are loaded through ``AutoModelForCausalLM.from_pretrained``
    with default arguments. The tests expect the two loaded models to expose
    equal weights and to generate the same text for ``prompt``.
    """

    model_sparse_uncompressed = "horheynm/llama2.c_stories15M_pruned_50.2of4_uncompressed"
    model_sparse_compressed = "horheynm/llama2.c_stories15M_pruned_50.2of4_compressed"

    prompt = "Paris is the capital of which country?"

    stubs = [model_sparse_uncompressed, model_sparse_compressed]

    def tearDown(self):
        """Collect garbage and release cached CUDA memory after every test."""
        gc.collect()
        torch.cuda.empty_cache()
        gc.collect()

    def test_compressed_uncompressed_model_shapes(self):
        """
        Check that the weights are the same between
        uncompressed and compressed-decompressed model

        Sparse compressed modules' weights are "packed" and shape/value will
        differ
        """

        def _get_nested_attr(obj, attr_path):
            """Follow a dotted attribute path on ``obj``; return None if a hop is missing."""
            for attr in attr_path.split("."):
                if not hasattr(obj, attr):
                    return None
                obj = getattr(obj, attr)
            return obj

        # Imported lazily: compressed_tensors is an optional dependency guarded
        # by the @require_compressed_tensors decorator on the class.
        from compressed_tensors.quantization.utils import iter_named_leaf_modules

        uncompressed_model = AutoModelForCausalLM.from_pretrained(
            self.model_sparse_uncompressed,
        )
        compressed_model_decompressed = AutoModelForCausalLM.from_pretrained(
            self.model_sparse_compressed,
        )

        compared = 0
        for name, submodule in iter_named_leaf_modules(uncompressed_model):
            counterpart = _get_nested_attr(compressed_model_decompressed, name)
            # Explicit None check: a module that exists but is falsy must not
            # be mistaken for a missing one.
            if counterpart is None or not hasattr(submodule, "weight"):
                continue
            self.assertTrue(
                torch.equal(submodule.weight, counterpart.weight),
                msg=f"weights differ for module '{name}'",
            )
            compared += 1

        # Guard against a vacuous pass when no module names line up.
        self.assertGreater(compared, 0, "no weights were compared between the two models")

    def test_run_compressed_outputs_match(self):
        """Check that uncompressed and compressed-decompressed model outputs are the same"""

        from transformers import AutoTokenizer

        # Neither model depends on the stub being iterated, so load each once.
        uncompressed_model = AutoModelForCausalLM.from_pretrained(
            self.model_sparse_uncompressed,
        )
        compressed_model_decompressed = AutoModelForCausalLM.from_pretrained(
            self.model_sparse_compressed,
        )

        for stub in self.stubs:
            with self.subTest(stub=stub):
                # Only the tokenizer is taken from the stub under iteration.
                tokenizer = AutoTokenizer.from_pretrained(stub)
                input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids

                output_uncompressed = uncompressed_model.generate(input_ids, max_new_tokens=100)
                output_decompressed = compressed_model_decompressed.generate(input_ids, max_new_tokens=100)

                self.assertEqual(
                    tokenizer.decode(output_uncompressed[0]),
                    tokenizer.decode(output_decompressed[0]),
                )

View File

@@ -0,0 +1,94 @@
import gc
import unittest
from transformers import AutoModelForCausalLM
from transformers.testing_utils import require_compressed_tensors, require_torch
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class CompressedTensorsTest(unittest.TestCase):
    """Exercise loading quantized checkpoints with and without ``run_compressed``.

    Each checkpoint in ``stubs`` is loaded once with default arguments and once
    with ``CompressedTensorsConfig(run_compressed=False)``. The tests check the
    presence of ``CompressedLinear`` modules and that both loading modes
    generate the same text for ``prompt``.
    """

    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"

    prompt = "Paris is the capital of which country?"

    stubs = [tinyllama_w4a16, tinyllama_w8a8]

    def tearDown(self):
        """Collect garbage and release cached CUDA memory after every test."""
        gc.collect()
        torch.cuda.empty_cache()
        gc.collect()

    @staticmethod
    def _count_compressed_linear(model):
        """Return how many leaf modules of ``model`` are ``CompressedLinear`` instances."""
        # Imported lazily: compressed_tensors is an optional dependency guarded
        # by the @require_compressed_tensors decorator on the class.
        from compressed_tensors.linear.compressed_linear import CompressedLinear
        from compressed_tensors.quantization.utils import iter_named_leaf_modules

        return sum(isinstance(submodule, CompressedLinear) for _, submodule in iter_named_leaf_modules(model))

    def test_default_run_compressed__True(self):
        """Loading with default arguments yields at least one ``CompressedLinear`` module."""
        for stub in self.stubs:
            with self.subTest(stub=stub):
                model = AutoModelForCausalLM.from_pretrained(
                    stub,
                )

                # some linear modules are not compressed - ex. lm_head
                self.assertGreater(self._count_compressed_linear(model), 0)

    def test_default_run_compressed__False(self):
        """Loading with ``run_compressed=False`` yields no ``CompressedLinear`` modules."""
        from transformers.utils.quantization_config import CompressedTensorsConfig

        quantization_config = CompressedTensorsConfig(run_compressed=False)

        for stub in self.stubs:
            with self.subTest(stub=stub):
                model = AutoModelForCausalLM.from_pretrained(
                    stub,
                    quantization_config=quantization_config,
                )

                # No modules should be CompressedLinear
                self.assertEqual(self._count_compressed_linear(model), 0)

    def test_run_compressed_outputs_match(self):
        """Check that run_compressed=True/False output are the same"""

        from transformers import AutoTokenizer
        from transformers.utils.quantization_config import CompressedTensorsConfig

        quantization_config = CompressedTensorsConfig(run_compressed=False)

        for stub in self.stubs:
            with self.subTest(stub=stub):
                tokenizer = AutoTokenizer.from_pretrained(stub)
                input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids

                # Default loading path for this checkpoint.
                model_run_compressed__True = AutoModelForCausalLM.from_pretrained(
                    stub,
                )
                output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)

                # Same checkpoint loaded with run_compressed=False.
                model_run_compressed__False = AutoModelForCausalLM.from_pretrained(
                    stub,
                    quantization_config=quantization_config,
                )
                output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)

                self.assertEqual(
                    tokenizer.decode(output_rc_true[0]),
                    tokenizer.decode(output_rc_false[0]),
                )