[CI] revert device in test_export_static_cache (#39662)

* revert device
* add todo
2025-07-25 16:36:12 +01:00
parent 850bdeaa95
commit 5d0ba3e479
10 changed files with 10 additions and 10 deletions
--- a/tests/models/cohere2/test_modeling_cohere2.py
+++ b/tests/models/cohere2/test_modeling_cohere2.py
@@ -248,7 +248,7 @@ class Cohere2IntegrationTest(unittest.TestCase):

        tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token="<PAD>", padding_side="right")
        # Load model
-        device = torch_device
+        device = "cpu"  # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM
        dtype = torch.bfloat16
        cache_implementation = "static"
        attn_implementation = "sdpa"
--- a/tests/models/gemma/test_modeling_gemma.py
+++ b/tests/models/gemma/test_modeling_gemma.py
@@ -423,7 +423,7 @@ class GemmaIntegrationTest(unittest.TestCase):
        ].shape[-1]

        # Load model
-        device = torch_device
+        device = "cpu"  # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM
        dtype = torch.bfloat16
        cache_implementation = "static"
        attn_implementation = "sdpa"
--- a/tests/models/gemma2/test_modeling_gemma2.py
+++ b/tests/models/gemma2/test_modeling_gemma2.py
@@ -335,7 +335,7 @@ class Gemma2IntegrationTest(unittest.TestCase):
        ].shape[-1]

        # Load model
-        device = torch_device
+        device = "cpu"  # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM
        dtype = torch.bfloat16
        cache_implementation = "static"
        attn_implementation = "sdpa"
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -322,7 +322,7 @@ class LlamaIntegrationTest(unittest.TestCase):
            ].shape[-1]

            # Load model
-            device = torch_device
+            device = "cpu"  # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM
            dtype = torch.bfloat16
            cache_implementation = "static"
            attn_implementation = "sdpa"
--- a/tests/models/olmo/test_modeling_olmo.py
+++ b/tests/models/olmo/test_modeling_olmo.py
@@ -347,7 +347,7 @@ class OlmoIntegrationTest(unittest.TestCase):
        ].shape[-1]

        # Load model
-        device = torch_device
+        device = "cpu"  # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM
        dtype = torch.bfloat16
        cache_implementation = "static"
        attn_implementation = "sdpa"
--- a/tests/models/olmo2/test_modeling_olmo2.py
+++ b/tests/models/olmo2/test_modeling_olmo2.py
@@ -348,7 +348,7 @@ class Olmo2IntegrationTest(unittest.TestCase):
        ].shape[-1]

        # Load model
-        device = torch_device
+        device = "cpu"  # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM
        dtype = torch.bfloat16
        cache_implementation = "static"
        attn_implementation = "sdpa"
--- a/tests/models/phi3/test_modeling_phi3.py
+++ b/tests/models/phi3/test_modeling_phi3.py
@@ -384,7 +384,7 @@ class Phi3IntegrationTest(unittest.TestCase):
            config.rope_scaling["type"] = "default"

        # Load model
-        device = torch_device
+        device = "cpu"  # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM
        dtype = torch.bfloat16
        cache_implementation = "static"
        attn_implementation = "sdpa"
--- a/tests/models/qwen2/test_modeling_qwen2.py
+++ b/tests/models/qwen2/test_modeling_qwen2.py
@@ -270,7 +270,7 @@ class Qwen2IntegrationTest(unittest.TestCase):
        ].shape[-1]

        # Load model
-        device = torch_device
+        device = "cpu"  # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM
        dtype = torch.bfloat16
        cache_implementation = "static"
        attn_implementation = "sdpa"
--- a/tests/models/qwen3/test_modeling_qwen3.py
+++ b/tests/models/qwen3/test_modeling_qwen3.py
@@ -261,7 +261,7 @@ class Qwen3IntegrationTest(unittest.TestCase):
        max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[
            "input_ids"
        ].shape[-1]
-        device = torch_device
+        device = "cpu"  # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM
        dtype = torch.bfloat16
        cache_implementation = "static"
        attn_implementation = "sdpa"
--- a/tests/models/smollm3/test_modeling_smollm3.py
+++ b/tests/models/smollm3/test_modeling_smollm3.py
@@ -191,7 +191,7 @@ class SmolLM3IntegrationTest(unittest.TestCase):
        ].shape[-1]

        # Load model
-        device = "cpu"
+        device = "cpu"  # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM
        dtype = torch.bfloat16
        cache_implementation = "static"
        attn_implementation = "sdpa"