HfQuantizer class for quantization-related stuff in modeling_utils.py (#26610)
* squashed earlier commits for easier rebase
* rm rebase leftovers
* 4bit save enabled @quantizers
* TMP gptq test use exllama
* fix AwqConfigTest::test_wrong_backend for A100
* quantizers AWQ fixes
* _load_pretrained_model low_cpu_mem_usage branch
* quantizers style
* remove require_low_cpu_mem_usage attr
* rm dtype arg from process_model_before_weight_loading
* rm config_origin from Q-config
* rm inspect from q_config
* fixed docstrings in QuantizationConfigParser
* logger.warning fix
* mv is_loaded_in_4(8)bit to BnbHFQuantizer
* is_accelerate_available error msg fix in quantizer
* split is_model_trainable in bnb quantizer class
* rm llm_int8_skip_modules as separate var in Q
* Q rm todo
* fwd ref to HFQuantizer in type hint
* rm note re optimum.gptq.GPTQQuantizer
* quantization_config in __init__ simplified
* replaced NonImplemented with create_quantized_param
* rm load_in_4/8_bit deprecation warning
* QuantizationConfigParser refactoring
* awq-related minor changes
* awq-related changes
* awq config.modules_to_not_convert
* raise error if no q-method in q-config in args
* minor cleanup
* awq quantizer docstring
* combine common parts in bnb process_model_before_weight_loading
* revert test_gptq
* .process_model_ cleanup
* restore dict config warning
* removed typevars in quantizers.py
* cleanup post-rebase 16 jan
* QuantizationConfigParser classmethod refactor
* rework of handling of unexpected aux elements of bnb weights
* moved q-related stuff from save_pretrained to quantizers
* refactor v1
* more changes
* fix some tests
* remove it from main init
* ooops
* Apply suggestions from code review
  Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
* fix awq issues
* fix
* fix
* fix
* fix
* fix
* fix
* add docs
* Apply suggestions from code review
  Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
  Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* Apply suggestions from code review
  Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* Update docs/source/en/hf_quantizer.md
* address comments
* fix
* fixup
* Update src/transformers/modeling_utils.py
  Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* Update src/transformers/modeling_utils.py
  Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* address final comment
* update
* Update src/transformers/quantizers/base.py
  Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* Update src/transformers/quantizers/auto.py
  Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
* fix
* add kwargs update
* fixup
* add `optimum_quantizer` attribute
* oops
* rm unneeded file
* fix doctests

---------

Co-authored-by: younesbelkada <younesbelkada@gmail.com>
Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
@@ -95,7 +95,8 @@ class BaseMixedInt8Test(unittest.TestCase):
|
||||
)
|
||||
|
||||
input_text = "Hello my name is"
|
||||
EXPECTED_OUTPUT = "Hello my name is John.\nI am a friend of the family.\n"
|
||||
EXPECTED_OUTPUTS = set()
|
||||
EXPECTED_OUTPUTS.add("Hello my name is John.\nI am a friend of the family.\n")
|
||||
MAX_NEW_TOKENS = 10
|
||||
|
||||
def setUp(self):
|
||||
@@ -260,7 +261,7 @@ class MixedInt8Test(BaseMixedInt8Test):
|
||||
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
|
||||
output_sequences = self.model_8bit.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
|
||||
|
||||
self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
||||
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
def test_generate_quality_config(self):
|
||||
r"""
|
||||
@@ -278,7 +279,7 @@ class MixedInt8Test(BaseMixedInt8Test):
|
||||
input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10
|
||||
)
|
||||
|
||||
self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
||||
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
def test_raise_if_config_and_load_in_8bit(self):
|
||||
r"""
|
||||
@@ -365,9 +366,7 @@ class MixedInt8Test(BaseMixedInt8Test):
|
||||
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
|
||||
output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
|
||||
|
||||
self.assertEqual(
|
||||
self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT
|
||||
)
|
||||
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
def test_int8_serialization_regression(self):
|
||||
r"""
|
||||
@@ -392,9 +391,7 @@ class MixedInt8Test(BaseMixedInt8Test):
|
||||
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
|
||||
output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
|
||||
|
||||
self.assertEqual(
|
||||
self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT
|
||||
)
|
||||
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
def test_int8_serialization_sharded(self):
|
||||
r"""
|
||||
@@ -419,9 +416,7 @@ class MixedInt8Test(BaseMixedInt8Test):
|
||||
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
|
||||
output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
|
||||
|
||||
self.assertEqual(
|
||||
self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT
|
||||
)
|
||||
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
def test_int8_from_pretrained(self):
|
||||
r"""
|
||||
@@ -441,7 +436,7 @@ class MixedInt8Test(BaseMixedInt8Test):
|
||||
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
|
||||
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
|
||||
|
||||
self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
||||
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
|
||||
@require_bitsandbytes
|
||||
@@ -628,7 +623,7 @@ class MixedInt8TestPipeline(BaseMixedInt8Test):
|
||||
|
||||
# Real second forward pass
|
||||
pipeline_output = self.pipe(self.input_text)
|
||||
self.assertEqual(pipeline_output[0]["generated_text"], self.EXPECTED_OUTPUT)
|
||||
self.assertIn(pipeline_output[0]["generated_text"], self.EXPECTED_OUTPUTS)
|
||||
|
||||
|
||||
@require_torch_multi_gpu
|
||||
@@ -654,7 +649,7 @@ class MixedInt8TestMultiGpu(BaseMixedInt8Test):
|
||||
|
||||
# Second real batch
|
||||
output_parallel = model_parallel.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
|
||||
self.assertEqual(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
||||
self.assertIn(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
|
||||
@require_torch_multi_gpu
|
||||
@@ -671,7 +666,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
|
||||
|
||||
# Get the generation
|
||||
output_text = self.tokenizer.decode(output_parallel[0], skip_special_tokens=True)
|
||||
self.assertEqual(output_text, self.EXPECTED_OUTPUT)
|
||||
self.assertIn(output_text, self.EXPECTED_OUTPUTS)
|
||||
|
||||
def test_cpu_gpu_loading_random_device_map(self):
|
||||
r"""
|
||||
@@ -708,7 +703,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
|
||||
"transformer.ln_f": 1,
|
||||
}
|
||||
|
||||
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
|
||||
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True, load_in_8bit=True)
|
||||
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(
|
||||
self.model_name,
|
||||
@@ -734,7 +729,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
|
||||
"transformer.h": 0,
|
||||
"transformer.ln_f": 1,
|
||||
}
|
||||
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
|
||||
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True, load_in_8bit=True)
|
||||
|
||||
# Load model
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(
|
||||
@@ -760,7 +755,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
|
||||
"transformer.h": 1,
|
||||
"transformer.ln_f": "disk",
|
||||
}
|
||||
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
|
||||
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True, load_in_8bit=True)
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
# Load model
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(
|
||||
@@ -849,7 +844,9 @@ class MixedInt8TestTraining(BaseMixedInt8Test):
|
||||
class MixedInt8GPT2Test(MixedInt8Test):
|
||||
model_name = "gpt2-xl"
|
||||
EXPECTED_RELATIVE_DIFFERENCE = 1.8720077507258357
|
||||
EXPECTED_OUTPUT = "Hello my name is John Doe, and I'm a big fan of"
|
||||
EXPECTED_OUTPUTS = set()
|
||||
EXPECTED_OUTPUTS.add("Hello my name is John Doe, and I'm a big fan of")
|
||||
EXPECTED_OUTPUTS.add("Hello my name is John Doe, and I'm a fan of the")
|
||||
|
||||
def test_int8_from_pretrained(self):
|
||||
r"""
|
||||
@@ -869,4 +866,4 @@ class MixedInt8GPT2Test(MixedInt8Test):
|
||||
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
|
||||
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
|
||||
|
||||
self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
||||
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
Reference in New Issue
Block a user