HfQuantizer class for quantization-related stuff in modeling_utils.py (#26610)

* squashed earlier commits for easier rebase

* rm rebase leftovers

* 4bit save enabled @quantizers

* TMP gptq test use exllama

* fix AwqConfigTest::test_wrong_backend for A100

* quantizers AWQ fixes

* _load_pretrained_model low_cpu_mem_usage branch

* quantizers style

* remove require_low_cpu_mem_usage attr

* rm dtype arg from process_model_before_weight_loading

* rm config_origin from Q-config

* rm inspect from q_config

* fixed docstrings in QuantizationConfigParser

* logger.warning fix

* mv is_loaded_in_4(8)bit to BnbHFQuantizer

* is_accelerate_available error msg fix in quantizer

* split is_model_trainable in bnb quantizer class

* rm llm_int8_skip_modules as separate var in Q

* Q rm todo

* fwd ref to HFQuantizer in type hint

* rm note re optimum.gptq.GPTQQuantizer

* quantization_config in __init__ simplified

* replaced NonImplemented with  create_quantized_param

* rm load_in_4/8_bit deprecation warning

* QuantizationConfigParser refactoring

* awq-related minor changes

* awq-related changes

* awq config.modules_to_not_convert

* raise error if no q-method in q-config in args

* minor cleanup

* awq quantizer docstring

* combine common parts in bnb process_model_before_weight_loading

* revert test_gptq

* .process_model_ cleanup

* restore dict config warning

* removed typevars in quantizers.py

* cleanup post-rebase 16 jan

* QuantizationConfigParser classmethod refactor

* rework of handling of unexpected aux elements of bnb weights

* moved q-related stuff from save_pretrained to quantizers

* refactor v1

* more changes

* fix some tests

* remove it from main init

* ooops

* Apply suggestions from code review

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* fix awq issues

* fix

* fix

* fix

* fix

* fix

* fix

* add docs

* Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update docs/source/en/hf_quantizer.md

* address comments

* fix

* fixup

* Update src/transformers/modeling_utils.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/modeling_utils.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* address final comment

* update

* Update src/transformers/quantizers/base.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update src/transformers/quantizers/auto.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* fix

* add kwargs update

* fixup

* add `optimum_quantizer` attribute

* oops

* rm unneeded file

* fix doctests

---------

Co-authored-by: younesbelkada <younesbelkada@gmail.com>
Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
Poedator
2024-01-30 04:48:25 +03:00
committed by GitHub
parent 1f5590d32e
commit d78e78a0e4
18 changed files with 1443 additions and 487 deletions

View File

@@ -55,8 +55,15 @@ class AwqConfigTest(unittest.TestCase):
with self.assertRaises(ValueError):
AwqConfig(bits=4, backend="unexisting-backend")
# LLMAWQ does not work on a T4
with self.assertRaises(ValueError):
compute_capability = torch.cuda.get_device_capability()
major, minor = compute_capability
if major < 8:
# LLMAWQ does not work on a T4
with self.assertRaises(ValueError):
AwqConfig(bits=4, backend="llm-awq")
else:
# LLMAWQ should work on an A100
AwqConfig(bits=4, backend="llm-awq")
def test_to_dict(self):

View File

@@ -95,7 +95,8 @@ class BaseMixedInt8Test(unittest.TestCase):
)
input_text = "Hello my name is"
EXPECTED_OUTPUT = "Hello my name is John.\nI am a friend of the family.\n"
EXPECTED_OUTPUTS = set()
EXPECTED_OUTPUTS.add("Hello my name is John.\nI am a friend of the family.\n")
MAX_NEW_TOKENS = 10
def setUp(self):
@@ -260,7 +261,7 @@ class MixedInt8Test(BaseMixedInt8Test):
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = self.model_8bit.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_generate_quality_config(self):
r"""
@@ -278,7 +279,7 @@ class MixedInt8Test(BaseMixedInt8Test):
input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10
)
self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_raise_if_config_and_load_in_8bit(self):
r"""
@@ -365,9 +366,7 @@ class MixedInt8Test(BaseMixedInt8Test):
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
self.assertEqual(
self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT
)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_int8_serialization_regression(self):
r"""
@@ -392,9 +391,7 @@ class MixedInt8Test(BaseMixedInt8Test):
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
self.assertEqual(
self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT
)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_int8_serialization_sharded(self):
r"""
@@ -419,9 +416,7 @@ class MixedInt8Test(BaseMixedInt8Test):
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = model_from_saved.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
self.assertEqual(
self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT
)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_int8_from_pretrained(self):
r"""
@@ -441,7 +436,7 @@ class MixedInt8Test(BaseMixedInt8Test):
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
@require_bitsandbytes
@@ -628,7 +623,7 @@ class MixedInt8TestPipeline(BaseMixedInt8Test):
# Real second forward pass
pipeline_output = self.pipe(self.input_text)
self.assertEqual(pipeline_output[0]["generated_text"], self.EXPECTED_OUTPUT)
self.assertIn(pipeline_output[0]["generated_text"], self.EXPECTED_OUTPUTS)
@require_torch_multi_gpu
@@ -654,7 +649,7 @@ class MixedInt8TestMultiGpu(BaseMixedInt8Test):
# Second real batch
output_parallel = model_parallel.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
self.assertEqual(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
self.assertIn(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
@require_torch_multi_gpu
@@ -671,7 +666,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
# Get the generation
output_text = self.tokenizer.decode(output_parallel[0], skip_special_tokens=True)
self.assertEqual(output_text, self.EXPECTED_OUTPUT)
self.assertIn(output_text, self.EXPECTED_OUTPUTS)
def test_cpu_gpu_loading_random_device_map(self):
r"""
@@ -708,7 +703,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
"transformer.ln_f": 1,
}
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True, load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
self.model_name,
@@ -734,7 +729,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
"transformer.h": 0,
"transformer.ln_f": 1,
}
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True, load_in_8bit=True)
# Load model
model_8bit = AutoModelForCausalLM.from_pretrained(
@@ -760,7 +755,7 @@ class MixedInt8TestCpuGpu(BaseMixedInt8Test):
"transformer.h": 1,
"transformer.ln_f": "disk",
}
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
bnb_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True, load_in_8bit=True)
with tempfile.TemporaryDirectory() as tmpdirname:
# Load model
model_8bit = AutoModelForCausalLM.from_pretrained(
@@ -849,7 +844,9 @@ class MixedInt8TestTraining(BaseMixedInt8Test):
class MixedInt8GPT2Test(MixedInt8Test):
model_name = "gpt2-xl"
EXPECTED_RELATIVE_DIFFERENCE = 1.8720077507258357
EXPECTED_OUTPUT = "Hello my name is John Doe, and I'm a big fan of"
EXPECTED_OUTPUTS = set()
EXPECTED_OUTPUTS.add("Hello my name is John Doe, and I'm a big fan of")
EXPECTED_OUTPUTS.add("Hello my name is John Doe, and I'm a fan of the")
def test_int8_from_pretrained(self):
r"""
@@ -869,4 +866,4 @@ class MixedInt8GPT2Test(MixedInt8Test):
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
self.assertEqual(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)