From 99eb9b523f9b9ea6096323ce5610ce6633acc88a Mon Sep 17 00:00:00 2001
From: Zachary Mueller <muellerzr@gmail.com>
Date: Thu, 21 Jul 2022 14:44:57 -0400
Subject: [PATCH] Fix `no_trainer` CI (#18242)

* Fix all tests
---
 examples/pytorch/test_accelerate_examples.py | 35 ++++++++++++--------
 src/transformers/testing_utils.py            | 25 +++++++++++++-
 2 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/examples/pytorch/test_accelerate_examples.py b/examples/pytorch/test_accelerate_examples.py
index b4f3157d78..99a8b0db84 100644
--- a/examples/pytorch/test_accelerate_examples.py
+++ b/examples/pytorch/test_accelerate_examples.py
@@ -19,14 +19,14 @@ import json
 import logging
 import os
 import shutil
-import subprocess
 import sys
 import tempfile
+from unittest import mock
 
 import torch
 
 from accelerate.utils import write_basic_config
-from transformers.testing_utils import TestCasePlus, get_gpu_count, slow, torch_device
+from transformers.testing_utils import TestCasePlus, get_gpu_count, run_command, slow, torch_device
 from transformers.utils import is_apex_available
 
 
@@ -75,6 +75,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdir)
 
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
     def test_run_glue_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -94,12 +95,13 @@ class ExamplesTestsNoTrainer(TestCasePlus):
         if is_cuda_and_apex_available():
             testargs.append("--fp16")
 
-        _ = subprocess.run(self._launch_args + testargs, stdout=subprocess.PIPE)
+        run_command(self._launch_args + testargs)
         result = get_results(tmp_dir)
         self.assertGreaterEqual(result["eval_accuracy"], 0.75)
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "glue_no_trainer")))
 
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
     def test_run_clm_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -120,12 +122,13 @@ class ExamplesTestsNoTrainer(TestCasePlus):
             # Skipping because there are not enough batches to train the model + would need a drop_last to work.
             return
 
-        _ = subprocess.run(self._launch_args + testargs, stdout=subprocess.PIPE)
+        run_command(self._launch_args + testargs)
         result = get_results(tmp_dir)
         self.assertLess(result["perplexity"], 100)
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "clm_no_trainer")))
 
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
     def test_run_mlm_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -139,12 +142,13 @@ class ExamplesTestsNoTrainer(TestCasePlus):
             --with_tracking
         """.split()
 
-        _ = subprocess.run(self._launch_args + testargs, stdout=subprocess.PIPE)
+        run_command(self._launch_args + testargs)
         result = get_results(tmp_dir)
         self.assertLess(result["perplexity"], 42)
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "mlm_no_trainer")))
 
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
     def test_run_ner_no_trainer(self):
         # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu
         epochs = 7 if get_gpu_count() > 1 else 2
@@ -165,13 +169,14 @@ class ExamplesTestsNoTrainer(TestCasePlus):
             --with_tracking
         """.split()
 
-        _ = subprocess.run(self._launch_args + testargs, stdout=subprocess.PIPE)
+        run_command(self._launch_args + testargs)
         result = get_results(tmp_dir)
         self.assertGreaterEqual(result["eval_accuracy"], 0.75)
         self.assertLess(result["train_loss"], 0.5)
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "ner_no_trainer")))
 
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
     def test_run_squad_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -190,7 +195,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
             --with_tracking
         """.split()
 
-        _ = subprocess.run(self._launch_args + testargs, stdout=subprocess.PIPE)
+        run_command(self._launch_args + testargs)
         result = get_results(tmp_dir)
         # Because we use --version_2_with_negative the testing script uses SQuAD v2 metrics.
         self.assertGreaterEqual(result["eval_f1"], 28)
@@ -198,6 +203,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "qa_no_trainer")))
 
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
     def test_run_swag_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -214,12 +220,13 @@ class ExamplesTestsNoTrainer(TestCasePlus):
             --with_tracking
         """.split()
 
-        _ = subprocess.run(self._launch_args + testargs, stdout=subprocess.PIPE)
+        run_command(self._launch_args + testargs)
         result = get_results(tmp_dir)
         self.assertGreaterEqual(result["eval_accuracy"], 0.8)
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "swag_no_trainer")))
 
     @slow
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
     def test_run_summarization_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -237,7 +244,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
             --with_tracking
         """.split()
 
-        _ = subprocess.run(self._launch_args + testargs, stdout=subprocess.PIPE)
+        run_command(self._launch_args + testargs)
         result = get_results(tmp_dir)
         self.assertGreaterEqual(result["eval_rouge1"], 10)
         self.assertGreaterEqual(result["eval_rouge2"], 2)
@@ -247,6 +254,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "summarization_no_trainer")))
 
     @slow
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
     def test_run_translation_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -268,7 +276,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
             --with_tracking
         """.split()
 
-        _ = subprocess.run(self._launch_args + testargs, stdout=subprocess.PIPE)
+        run_command(self._launch_args + testargs)
         result = get_results(tmp_dir)
         self.assertGreaterEqual(result["eval_bleu"], 30)
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
@@ -292,10 +300,11 @@ class ExamplesTestsNoTrainer(TestCasePlus):
             --checkpointing_steps epoch
         """.split()
 
-        _ = subprocess.run(self._launch_args + testargs, stdout=subprocess.PIPE)
+        run_command(self._launch_args + testargs)
         result = get_results(tmp_dir)
         self.assertGreaterEqual(result["eval_overall_accuracy"], 0.10)
 
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
     def test_run_image_classification_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -316,9 +325,9 @@ class ExamplesTestsNoTrainer(TestCasePlus):
         if is_cuda_and_apex_available():
             testargs.append("--fp16")
 
-        _ = subprocess.run(self._launch_args + testargs, stdout=subprocess.PIPE)
+        run_command(self._launch_args + testargs)
         result = get_results(tmp_dir)
         # The base model scores a 25%
-        self.assertGreaterEqual(result["eval_accuracy"], 0.625)
+        self.assertGreaterEqual(result["eval_accuracy"], 0.6)
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "step_1")))
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "image_classification_no_trainer")))
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index bd83529f9c..9360347238 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -20,6 +20,7 @@ import os
 import re
 import shlex
 import shutil
+import subprocess
 import sys
 import tempfile
 import unittest
@@ -27,7 +28,7 @@ from collections.abc import Mapping
 from distutils.util import strtobool
 from io import StringIO
 from pathlib import Path
-from typing import Iterator, Union
+from typing import Iterator, List, Union
 from unittest import mock
 
 from transformers import logging as transformers_logging
@@ -1561,3 +1562,25 @@ def to_2tuple(x):
     if isinstance(x, collections.abc.Iterable):
         return x
     return (x, x)
+
+
+# Utilities for running scripts in a subprocess and surfacing their output as the error message when they fail
+class SubprocessCallException(Exception):
+    pass
+
+
+def run_command(command: List[str], return_stdout=False):
+    """
+    Runs `command` with `subprocess.check_output` and returns the decoded `stdout` when `return_stdout` is True.
+    Raises a `SubprocessCallException` carrying the combined stdout/stderr output if `command` fails.
+    """
+    try:
+        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
+        if return_stdout:
+            if hasattr(output, "decode"):
+                output = output.decode("utf-8")
+            return output
+    except subprocess.CalledProcessError as e:
+        raise SubprocessCallException(
+            f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
+        ) from e