From 584eeb5387193d352da976cc3d1305f5c3404850 Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Thu, 24 Aug 2023 14:57:16 +0200
Subject: [PATCH] [`AutoGPTQ`] Add correct installation of GPTQ library + fix
 slow tests (#25713)

* add correct installation of the GPTQ library (install auto-gptq with the cu118 wheel index as an extra index URL)

* update test values (expected generation outputs; use bits=4 when reloading with exllama enabled)
---
 docker/transformers-all-latest-gpu/Dockerfile | 2 +-
 tests/quantization/gptq/test_gptq.py          | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index c96b4cc79b..a6c672e1a9 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -50,7 +50,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef
 RUN python3 -m pip install --no-cache-dir bitsandbytes
 
 # Add auto-gptq for gtpq quantization testing
-RUN python3 -m pip install --no-cache-dir auto-gptq 
+RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 
 # Add einops for additional model testing
 RUN python3 -m pip install --no-cache-dir einops
diff --git a/tests/quantization/gptq/test_gptq.py b/tests/quantization/gptq/test_gptq.py
index 257c6f020d..c7530471fa 100644
--- a/tests/quantization/gptq/test_gptq.py
+++ b/tests/quantization/gptq/test_gptq.py
@@ -87,7 +87,8 @@ class GPTQTest(unittest.TestCase):
     EXPECTED_OUTPUTS = set()
     EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I")
     EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.")
-    EXPECTED_OUTPUTS.add("Hello my name is Alyson and I am a professional photographer")
+    EXPECTED_OUTPUTS.add("Hello my name is Alyson, I am a student in the")
+    EXPECTED_OUTPUTS.add("Hello my name is Alyson and I am a very sweet,")
 
     # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings
     EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
@@ -215,7 +216,7 @@ class GPTQTest(unittest.TestCase):
                 self.assertEqual(self.quantized_model.config.quantization_config.disable_exllama, True)
                 # we need to put it directly to the gpu. Otherwise, we won't be able to initialize the exllama kernel
                 quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
-                    tmpdirname, quantization_config=GPTQConfig(disable_exllama=False, bits=6), device_map={"": 0}
+                    tmpdirname, quantization_config=GPTQConfig(disable_exllama=False, bits=4), device_map={"": 0}
                 )
                 self.assertEqual(quantized_model_from_saved.config.quantization_config.disable_exllama, False)
                 self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits)