From 658b849aeb0ae56de47aab259f76d3fe075075e5 Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Fri, 24 May 2024 14:35:59 +0200
Subject: [PATCH] Quantization / TST: Fix remaining quantization tests (#31000)

* Fix remaining quant tests

* Update test_quanto.py
---
 docker/transformers-all-latest-gpu/Dockerfile          | 3 ---
 docker/transformers-quantization-latest-gpu/Dockerfile | 3 +++
 tests/quantization/quanto_integration/test_quanto.py   | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index 930fdfb799..b888397f95 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -45,9 +45,6 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt
 # For video model testing
 RUN python3 -m pip install --no-cache-dir decord av==9.2.0
 
-# For GGUF tests
-RUN python3 -m pip install --no-cache-dir gguf
-
 # Some slow tests require bnb
 RUN python3 -m pip install --no-cache-dir bitsandbytes
 
diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile
index 2b74dca91f..6d94dbee5a 100755
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@@ -48,6 +48,9 @@ RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2
 # Add hqq for quantization testing
 RUN python3 -m pip install --no-cache-dir hqq
 
+# For GGUF tests
+RUN python3 -m pip install --no-cache-dir gguf
+
 # Add autoawq for quantization testing
 # >=v0.2.3 needed for compatibility with torch 2.2.1
 RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl
diff --git a/tests/quantization/quanto_integration/test_quanto.py b/tests/quantization/quanto_integration/test_quanto.py
index f574478241..e662300a46 100644
--- a/tests/quantization/quanto_integration/test_quanto.py
+++ b/tests/quantization/quanto_integration/test_quanto.py
@@ -440,6 +440,7 @@ class QuantoQuantizationActivationTest(unittest.TestCase):
         self.assertIn("We don't support quantizing the activations with transformers library", str(e.exception))
 
 
+@require_quanto
 @require_torch_gpu
 class QuantoKVCacheQuantizationTest(unittest.TestCase):
     @slow
@@ -447,7 +448,7 @@ class QuantoKVCacheQuantizationTest(unittest.TestCase):
     def test_quantized_cache(self):
         EXPECTED_TEXT_COMPLETION = [
             "Simply put, the theory of relativity states that 1) the speed of light is the same for all observers, and 2) the laws of physics are the same for all observers.\nThe first part of the theory of relativity",
-            "My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my burgers, my hot dogs, my sandwiches, my chicken, my pizza, my sal",
+            "My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my burgers, my hot dogs, my sandwiches, my salads, my chicken, my fish",
         ]
 
         prompts = [