[s2s] test_distributed_eval (#8315)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
This commit is contained in:
@@ -450,7 +450,8 @@ Inside tests:
|
|||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
torch.cuda.device_count()
|
from transformers.testing_utils import get_gpu_count
|
||||||
|
n_gpu = get_gpu_count() # works with torch and tf
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -2,9 +2,9 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from transformers import BertTokenizer, EncoderDecoderModel, is_torch_available
|
from transformers import BertTokenizer, EncoderDecoderModel
|
||||||
from transformers.file_utils import is_datasets_available
|
from transformers.file_utils import is_datasets_available
|
||||||
from transformers.testing_utils import TestCasePlus, execute_subprocess_async, slow
|
from transformers.testing_utils import TestCasePlus, execute_subprocess_async, get_gpu_count, slow
|
||||||
from transformers.trainer_callback import TrainerState
|
from transformers.trainer_callback import TrainerState
|
||||||
from transformers.trainer_utils import set_seed
|
from transformers.trainer_utils import set_seed
|
||||||
|
|
||||||
@@ -13,9 +13,6 @@ from .seq2seq_trainer import Seq2SeqTrainer
|
|||||||
from .test_seq2seq_examples import MBART_TINY
|
from .test_seq2seq_examples import MBART_TINY
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
|
||||||
import torch
|
|
||||||
|
|
||||||
set_seed(42)
|
set_seed(42)
|
||||||
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
|
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
|
||||||
|
|
||||||
@@ -196,7 +193,7 @@ class TestFinetuneTrainer(TestCasePlus):
|
|||||||
""".split()
|
""".split()
|
||||||
# --eval_beams 2
|
# --eval_beams 2
|
||||||
|
|
||||||
n_gpu = torch.cuda.device_count()
|
n_gpu = get_gpu_count()
|
||||||
if n_gpu > 1:
|
if n_gpu > 1:
|
||||||
distributed_args = f"""
|
distributed_args = f"""
|
||||||
-m torch.distributed.launch
|
-m torch.distributed.launch
|
||||||
|
|||||||
@@ -3,7 +3,14 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from transformers.testing_utils import TestCasePlus, execute_subprocess_async, require_torch_multigpu
|
from transformers.testing_utils import (
|
||||||
|
TestCasePlus,
|
||||||
|
execute_subprocess_async,
|
||||||
|
get_gpu_count,
|
||||||
|
require_torch_gpu,
|
||||||
|
require_torch_multigpu,
|
||||||
|
slow,
|
||||||
|
)
|
||||||
|
|
||||||
from .test_seq2seq_examples import CHEAP_ARGS, make_test_data_dir
|
from .test_seq2seq_examples import CHEAP_ARGS, make_test_data_dir
|
||||||
from .utils import load_json
|
from .utils import load_json
|
||||||
@@ -80,3 +87,30 @@ class TestSummarizationDistillerMultiGPU(TestCasePlus):
|
|||||||
self.assertEqual(len(metrics["test"]), 1)
|
self.assertEqual(len(metrics["test"]), 1)
|
||||||
desired_n_evals = int(args_d["max_epochs"] * (1 / args_d["val_check_interval"]) / 2 + 1)
|
desired_n_evals = int(args_d["max_epochs"] * (1 / args_d["val_check_interval"]) / 2 + 1)
|
||||||
self.assertEqual(len(metrics["val"]), desired_n_evals)
|
self.assertEqual(len(metrics["val"]), desired_n_evals)
|
||||||
|
|
||||||
|
@slow
@require_torch_gpu
def test_distributed_eval(self):
    """
    Run ``run_distributed_eval.py`` under ``torch.distributed.launch`` and check the reported BLEU.

    The script is launched in a subprocess with one process per GPU reported by
    ``get_gpu_count()``. Afterwards ``test_bleu.json`` is read from the temporary
    output directory and its ``"bleu"`` entry must be at least 25.
    """
    output_dir = self.get_auto_remove_tmp_dir()
    # Anchor the data directory to the same directory the launched script is
    # resolved from, so the test does not depend on the directory pytest is
    # invoked from.
    # NOTE(review): assumes test_data/ sits next to run_distributed_eval.py - confirm.
    args = f"""
        --model_name Helsinki-NLP/opus-mt-en-ro
        --save_dir {output_dir}
        --data_dir {self.test_file_dir}/test_data/wmt_en_ro
        --num_beams 2
        --task translation
    """.split()

    # we want this test to run even if there is only one GPU, but if there are more we use them all
    n_gpu = get_gpu_count()
    distributed_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={n_gpu}
        {self.test_file_dir}/run_distributed_eval.py
    """.split()
    cmd = [sys.executable] + distributed_args + args
    execute_subprocess_async(cmd, env=self.get_env())

    metrics_save_path = os.path.join(output_dir, "test_bleu.json")
    metrics = load_json(metrics_save_path)
    self.assertGreaterEqual(metrics["bleu"], 25)
|
||||||
|
|||||||
@@ -297,6 +297,22 @@ def require_ray(test_case):
|
|||||||
return test_case
|
return test_case
|
||||||
|
|
||||||
|
|
||||||
|
def get_gpu_count():
    """
    Return the number of available GPUs, whichever of torch or tf is installed.

    PyTorch is consulted first when it is available; TensorFlow is only used as a
    fallback. When neither framework is available the result is 0.
    """
    if not (_torch_available or _tf_available):
        return 0

    if _torch_available:
        import torch

        return torch.cuda.device_count()

    # Only TensorFlow is available at this point.
    import tensorflow as tf

    gpu_devices = tf.config.list_physical_devices("GPU")
    return len(gpu_devices)
|
||||||
|
|
||||||
|
|
||||||
def get_tests_dir(append_path=None):
|
def get_tests_dir(append_path=None):
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
|
|||||||
Reference in New Issue
Block a user