Merge branch 'master' of github.com:huggingface/transformers
This commit is contained in:
43
tests/deepspeed/ds_config_zero2.json
Normal file
43
tests/deepspeed/ds_config_zero2.json
Normal file
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": true,
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 2e8,
|
||||
"overlap_comm": true,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": 2e8,
|
||||
"contiguous_gradients": true,
|
||||
"cpu_offload": true
|
||||
},
|
||||
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": 3e-5,
|
||||
"betas": [0.8, 0.999],
|
||||
"eps": 1e-8,
|
||||
"weight_decay": 3e-7
|
||||
}
|
||||
},
|
||||
|
||||
"scheduler": {
|
||||
"type": "WarmupLR",
|
||||
"params": {
|
||||
"warmup_min_lr": 0,
|
||||
"warmup_max_lr": 3e-5,
|
||||
"warmup_num_steps": 500
|
||||
}
|
||||
},
|
||||
|
||||
"steps_per_print": 2000,
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
48
tests/deepspeed/ds_config_zero3.json
Normal file
48
tests/deepspeed/ds_config_zero3.json
Normal file
@@ -0,0 +1,48 @@
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": true,
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"cpu_offload": true,
|
||||
"cpu_offload_params": true,
|
||||
"cpu_offload_use_pin_memory" : true,
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 1e14,
|
||||
"reduce_bucket_size": 0,
|
||||
"stage3_prefetch_bucket_size": 0,
|
||||
"stage3_param_persistence_threshold": 0,
|
||||
"stage3_max_live_parameters": 1e9,
|
||||
"stage3_max_reuse_distance": 1e9,
|
||||
"stage3_gather_fp16_weights_on_model_save": true
|
||||
},
|
||||
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": 3e-5,
|
||||
"betas": [0.8, 0.999],
|
||||
"eps": 1e-8,
|
||||
"weight_decay": 3e-7
|
||||
}
|
||||
},
|
||||
|
||||
"scheduler": {
|
||||
"type": "WarmupLR",
|
||||
"params": {
|
||||
"warmup_min_lr": 0,
|
||||
"warmup_max_lr": 3e-5,
|
||||
"warmup_num_steps": 500
|
||||
}
|
||||
},
|
||||
|
||||
"steps_per_print": 2000,
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
637
tests/deepspeed/test_deepspeed.py
Normal file
637
tests/deepspeed/test_deepspeed.py
Normal file
@@ -0,0 +1,637 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import dataclasses
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import unittest
|
||||
from copy import deepcopy
|
||||
|
||||
from parameterized import parameterized
|
||||
from transformers import TrainingArguments, is_torch_available
|
||||
from transformers.file_utils import WEIGHTS_NAME
|
||||
from transformers.integrations import is_deepspeed_available
|
||||
from transformers.testing_utils import (
|
||||
CaptureLogger,
|
||||
ExtendSysPath,
|
||||
TestCasePlus,
|
||||
execute_subprocess_async,
|
||||
get_gpu_count,
|
||||
mockenv_context,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
)
|
||||
from transformers.trainer_utils import set_seed
|
||||
|
||||
|
||||
bindir = os.path.abspath(os.path.dirname(__file__))
|
||||
with ExtendSysPath(f"{bindir}/.."):
|
||||
from test_trainer import TrainerIntegrationCommon # noqa
|
||||
|
||||
if is_torch_available():
|
||||
from test_trainer import get_regression_trainer # noqa
|
||||
|
||||
|
||||
set_seed(42)
|
||||
MBART_TINY = "sshleifer/tiny-mbart"
|
||||
T5_SMALL = "t5-small"
|
||||
|
||||
|
||||
def load_json(path):
|
||||
with open(path) as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
# a candidate for testing_utils
|
||||
def require_deepspeed(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires deepspeed
|
||||
"""
|
||||
if not is_deepspeed_available():
|
||||
return unittest.skip("test requires deepspeed")(test_case)
|
||||
else:
|
||||
return test_case
|
||||
|
||||
|
||||
ZERO2 = "zero2"
|
||||
ZERO3 = "zero3"
|
||||
stages = [ZERO2, ZERO3]
|
||||
|
||||
|
||||
@require_deepspeed
|
||||
@require_torch_gpu
|
||||
class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
||||
"""
|
||||
|
||||
This class is for testing directly via get_regression_trainer
|
||||
|
||||
It mixes in `TrainerIntegrationCommon` which already has a lot of helper validation methods
|
||||
which we can re-use here.
|
||||
|
||||
Important: this class' setup can only work with a single gpu because it runs within the current
|
||||
pytest worker. For multi-gpu tests use TestDeepSpeedWithLauncher.
|
||||
|
||||
Note: if any of the tests of this class get run there will be at least one gpu occupied by them
|
||||
until this pytest worker exits. This is because the gpu memory allocated by the cuda-kernels
|
||||
won't be released until this pytest worker exits.
|
||||
|
||||
This may appear as some run-away tests if you watch `nvidia-smi` while other tests that fork new
|
||||
processes are run. So there will be one or two "stale" processes reported in `nvidia-smi`. This
|
||||
is not a bug.
|
||||
"""
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
args = TrainingArguments(".")
|
||||
self.n_epochs = args.num_train_epochs
|
||||
self.batch_size = args.train_batch_size
|
||||
|
||||
self.dist_env_1_gpu = dict(
|
||||
MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
||||
)
|
||||
|
||||
self.ds_config_file = {}
|
||||
self.ds_config_file[ZERO2] = f"{self.test_file_dir_str}/ds_config_zero2.json"
|
||||
self.ds_config_file[ZERO3] = f"{self.test_file_dir_str}/ds_config_zero3.json"
|
||||
|
||||
# use self.get_config_dict(stage) to use these to ensure the original is not modified
|
||||
self.ds_config_dict = {}
|
||||
with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
|
||||
self.ds_config_dict[ZERO2] = json.load(f)
|
||||
with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
|
||||
self.ds_config_dict[ZERO3] = json.load(f)
|
||||
|
||||
def get_config_dict(self, stage):
|
||||
""" As the tests modify the dict, always make a copy """
|
||||
config = deepcopy(self.ds_config_dict[stage])
|
||||
if stage == ZERO3:
|
||||
# This setting slows things down, so don't enable it by default unless needed by a test.
|
||||
# It's in the file as a demo for users since we want everything to work out of the box even if slower.
|
||||
config["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
|
||||
return config
|
||||
|
||||
# --- These tests are enough to run on one of zero stages --- #
|
||||
|
||||
# Test various combos
|
||||
# 1. DS scheduler + DS optimizer: this is already tested by most other tests
|
||||
# 2. HF scheduler + HF optimizer:
|
||||
# 3. DS scheduler + HF optimizer:
|
||||
# 4. HF scheduler + DS optimizer:
|
||||
|
||||
def test_hf_scheduler_hf_optimizer(self):
|
||||
a = 0
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
ds_config_zero2_dict = self.get_config_dict(ZERO2)
|
||||
del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer
|
||||
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
|
||||
ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False
|
||||
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||
trainer = get_regression_trainer(a=a, local_rank=0, deepspeed=ds_config_zero2_dict)
|
||||
trainer.train()
|
||||
new_a = trainer.model.a.item()
|
||||
self.assertNotEqual(new_a, a)
|
||||
|
||||
def test_ds_scheduler_hf_optimizer(self):
|
||||
a = 0
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
ds_config_zero2_dict = self.get_config_dict(ZERO2)
|
||||
del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer
|
||||
ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False
|
||||
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||
trainer = get_regression_trainer(a=a, local_rank=0, deepspeed=ds_config_zero2_dict)
|
||||
trainer.train()
|
||||
new_a = trainer.model.a.item()
|
||||
self.assertNotEqual(new_a, a)
|
||||
|
||||
def test_hf_scheduler_ds_optimizer(self):
|
||||
# this combo is not possible at the moment
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
ds_config_zero2_dict = self.get_config_dict(ZERO2)
|
||||
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
|
||||
ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False
|
||||
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||
trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict)
|
||||
with self.assertRaises(Exception) as context:
|
||||
trainer.train()
|
||||
self.assertTrue("HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception))
|
||||
|
||||
def test_hf_optimizer_with_offload(self):
|
||||
# must not allow non-DS optimizer when using ZERO-offload
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
ds_config_zero2_dict = self.get_config_dict(ZERO2)
|
||||
del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer
|
||||
ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = True
|
||||
# sanity check - should the default config change
|
||||
assert (
|
||||
"cpu_offload" in ds_config_zero2_dict["zero_optimization"]
|
||||
and ds_config_zero2_dict["zero_optimization"]["cpu_offload"] is True
|
||||
), "ensure the config is set up correctly"
|
||||
trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict)
|
||||
with self.assertRaises(Exception) as context:
|
||||
trainer.train()
|
||||
self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception))
|
||||
|
||||
# --- These tests need to run on both zero stages --- #
|
||||
@parameterized.expand(stages)
|
||||
def test_fake_notebook_no_launcher(self, stage):
|
||||
# this setup emulates a notebook where a launcher needs to be emulated by hand
|
||||
|
||||
# note that unittest resets sys.stdout each test, so `CaptureStd` will work here to capture
|
||||
# DeepSpeed log if this test happens to run first in this pytest worker. But it will fail if
|
||||
# it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
|
||||
# to reset `logger.handlers[0].setStream(sys.stdout)` or directly capture from the logger.
|
||||
from deepspeed.utils import logger
|
||||
|
||||
with CaptureLogger(logger) as cs:
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage])
|
||||
trainer.train()
|
||||
assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
|
||||
|
||||
@parameterized.expand(stages)
|
||||
def test_early_get_last_lr(self, stage):
|
||||
# with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
|
||||
# not run for the first few dozen steps while loss scale is too large, and thus during
|
||||
# that time `get_last_lr` will fail if called during that warm up stage,
|
||||
#
|
||||
# setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
|
||||
# `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
a = b = 0.0
|
||||
trainer = get_regression_trainer(
|
||||
a=a,
|
||||
b=b,
|
||||
local_rank=0,
|
||||
train_len=8,
|
||||
deepspeed=self.ds_config_file[stage],
|
||||
per_device_train_batch_size=8,
|
||||
logging_steps=1,
|
||||
)
|
||||
trainer.train()
|
||||
post_train_a = trainer.model.a.item()
|
||||
|
||||
# XXX: for some reason the following check fails with zero3 - not a broken but a
|
||||
# different qualitative outcome - need to investigate at some point
|
||||
if stage == ZERO3:
|
||||
return
|
||||
|
||||
# it's enough that train didn't fail for this test, but we must check that
|
||||
# optimizer/scheduler didn't run (since if it did this test isn't testing the right thing)
|
||||
self.assertEqual(post_train_a, a)
|
||||
|
||||
@parameterized.expand(stages)
|
||||
def test_gradient_accumulation(self, stage):
|
||||
# this test measures that we get identical weights and similar loss with:
|
||||
# 1. per_device_train_batch_size=8, gradient_accumulation_steps=1
|
||||
# 2. per_device_train_batch_size=4, gradient_accumulation_steps=2
|
||||
# since the 2nd should produce the effective batch of 1st, with the same results
|
||||
#
|
||||
# I can get an identical loss for a small train_len=32, plus the power of the initial
|
||||
# dynamic loss scale value set to:
|
||||
# "fp16.initial_scale_power": 1
|
||||
# plus having the same WarmupLR's warmup_min_lr == warmup_max_lr in the config file
|
||||
# but for some reason going to train_len=64 the weights, weights start to mismatch with this setup.
|
||||
# the culprit seems to be `initial_scale_power` - putting it back to its default 32 keeps the weights identical
|
||||
|
||||
train_len = 64
|
||||
a = b = 0.0
|
||||
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
no_grad_accum_trainer = get_regression_trainer(
|
||||
a=a,
|
||||
b=b,
|
||||
local_rank=0,
|
||||
train_len=train_len,
|
||||
deepspeed=self.ds_config_file[stage],
|
||||
per_device_train_batch_size=8,
|
||||
gradient_accumulation_steps=1,
|
||||
)
|
||||
no_grad_accum_result = no_grad_accum_trainer.train()
|
||||
no_grad_accum_loss = no_grad_accum_result.training_loss
|
||||
no_grad_accum_a = no_grad_accum_trainer.model.a.item()
|
||||
no_grad_accum_b = no_grad_accum_trainer.model.b.item()
|
||||
# make sure the optimizer kicked in - if it hasn't changed from the original value of a then make train_len bigger
|
||||
self.assertNotEqual(no_grad_accum_a, a)
|
||||
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
yes_grad_accum_trainer = get_regression_trainer(
|
||||
a=a,
|
||||
b=b,
|
||||
local_rank=0,
|
||||
train_len=train_len,
|
||||
deepspeed=self.ds_config_file[stage],
|
||||
per_device_train_batch_size=4,
|
||||
gradient_accumulation_steps=2,
|
||||
)
|
||||
yes_grad_accum_result = yes_grad_accum_trainer.train()
|
||||
yes_grad_accum_loss = yes_grad_accum_result.training_loss
|
||||
yes_grad_accum_a = yes_grad_accum_trainer.model.a.item()
|
||||
yes_grad_accum_b = yes_grad_accum_trainer.model.b.item()
|
||||
self.assertNotEqual(yes_grad_accum_a, a)
|
||||
|
||||
# training with half the batch size but accumulation steps as 2 should give the same weights
|
||||
self.assertEqual(no_grad_accum_a, yes_grad_accum_a)
|
||||
self.assertEqual(no_grad_accum_b, yes_grad_accum_b)
|
||||
|
||||
# see the note above how to get identical loss on a small bs
|
||||
self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5)
|
||||
|
||||
def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage):
|
||||
# adapted from TrainerIntegrationCommon.check_saved_checkpoints
|
||||
|
||||
file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"]
|
||||
|
||||
if stage == ZERO2:
|
||||
ds_file_list = ["mp_rank_00_model_states.pt"]
|
||||
elif stage == ZERO3:
|
||||
ds_file_list = ["zero_pp_rank_0_mp_rank_00_model_states.pt"]
|
||||
else:
|
||||
raise ValueError(f"unknown stage {stage}")
|
||||
|
||||
# XXX: this can be recoded and then removed once we require deepspeed>0.3.13
|
||||
from packaging import version
|
||||
|
||||
import deepspeed
|
||||
|
||||
if version.parse(deepspeed.__version__) > version.parse("0.3.13"):
|
||||
ds_file_list.append("zero_pp_rank_0_mp_rank_00_optim_states.pt")
|
||||
else:
|
||||
ds_file_list.append("zero_pp_rank_0_mp_rank_00optim_states.pt")
|
||||
|
||||
for step in range(freq, total, freq):
|
||||
checkpoint = os.path.join(output_dir, f"checkpoint-{step}")
|
||||
self.assertTrue(os.path.isdir(checkpoint), f"[{stage}] {checkpoint} dir is not found")
|
||||
|
||||
# common files
|
||||
for filename in file_list:
|
||||
path = os.path.join(checkpoint, filename)
|
||||
self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found")
|
||||
|
||||
# ds files
|
||||
ds_path = os.path.join(checkpoint, f"global_step{step}")
|
||||
for filename in ds_file_list:
|
||||
# filename = os.path.join(path, filename)
|
||||
# print(filename)
|
||||
path = os.path.join(ds_path, filename)
|
||||
self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found")
|
||||
|
||||
@parameterized.expand(stages)
|
||||
def test_save_checkpoints(self, stage):
|
||||
# adapted from TrainerIntegrationTest.test_save_checkpoints
|
||||
|
||||
freq = 5
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
ds_config_dict = self.get_config_dict(stage)
|
||||
ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||
if stage == ZERO3:
|
||||
ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True
|
||||
|
||||
# save checkpoints
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
trainer = get_regression_trainer(
|
||||
output_dir=output_dir,
|
||||
save_steps=freq,
|
||||
deepspeed=ds_config_dict,
|
||||
)
|
||||
trainer.train()
|
||||
|
||||
total = int(self.n_epochs * 64 / self.batch_size)
|
||||
self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage)
|
||||
|
||||
@parameterized.expand(stages)
|
||||
def test_can_resume_training_errors(self, stage):
|
||||
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
ds_config_dict = self.get_config_dict(stage)
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
trainer = get_regression_trainer(output_dir=output_dir, deepspeed=ds_config_dict)
|
||||
|
||||
# 1. fail to find any checkpoint - due a fresh output_dir
|
||||
with self.assertRaises(Exception) as context:
|
||||
trainer.train(resume_from_checkpoint=True)
|
||||
self.assertTrue(
|
||||
"No valid checkpoint found in output directory" in str(context.exception),
|
||||
f"got exception: {context.exception}",
|
||||
)
|
||||
|
||||
# 2. fail to find a bogus checkpoint
|
||||
with self.assertRaises(Exception) as context:
|
||||
checkpoint = os.path.join(output_dir, "checkpoint-5")
|
||||
trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
|
||||
self.assertTrue(
|
||||
"Can't find a valid checkpoint at" in str(context.exception), f"got exception: {context.exception}"
|
||||
)
|
||||
|
||||
@parameterized.expand(stages)
|
||||
def test_can_resume_training_normal(self, stage):
|
||||
# adapted from TrainerIntegrationTest.test_can_resume_training
|
||||
# test normal resume for each stage separately, error-handling is tested in a different test
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
ds_config_dict = self.get_config_dict(stage)
|
||||
ds_config_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||
if stage == ZERO3:
|
||||
ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True
|
||||
|
||||
kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)
|
||||
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
trainer = get_regression_trainer(**kwargs)
|
||||
trainer.train()
|
||||
(a, b) = trainer.model.a.item(), trainer.model.b.item()
|
||||
state = dataclasses.asdict(trainer.state)
|
||||
|
||||
checkpoint = os.path.join(output_dir, "checkpoint-5")
|
||||
|
||||
# Reinitialize trainer
|
||||
trainer = get_regression_trainer(**kwargs)
|
||||
|
||||
trainer.train(resume_from_checkpoint=checkpoint)
|
||||
(a1, b1) = trainer.model.a.item(), trainer.model.b.item()
|
||||
state1 = dataclasses.asdict(trainer.state)
|
||||
self.assertEqual(a, a1)
|
||||
self.assertEqual(b, b1)
|
||||
self.check_trainer_state_are_the_same(state, state1)
|
||||
|
||||
# Now check with a later checkpoint that it also works when we span over one epoch
|
||||
checkpoint = os.path.join(output_dir, "checkpoint-15")
|
||||
|
||||
# Reinitialize trainer and load model
|
||||
trainer = get_regression_trainer(**kwargs)
|
||||
|
||||
trainer.train(resume_from_checkpoint=checkpoint)
|
||||
(a1, b1) = trainer.model.a.item(), trainer.model.b.item()
|
||||
state1 = dataclasses.asdict(trainer.state)
|
||||
self.assertEqual(a, a1)
|
||||
self.assertEqual(b, b1)
|
||||
self.check_trainer_state_are_the_same(state, state1)
|
||||
|
||||
|
||||
@slow
|
||||
@require_deepspeed
|
||||
@require_torch_gpu
|
||||
class TestDeepSpeedWithLauncher(TestCasePlus):
|
||||
""" This class is for testing via an external script - can do multiple gpus """
|
||||
|
||||
# Tests to devise #
|
||||
#
|
||||
# 1. predict_with_generate on multigpu - need to figure out how to give input sequences so that
|
||||
# the 2 gpus will generate prediction sequences that aren't of the same length - this is because
|
||||
# we had to code a special feature to sync the gpus when the predicted sequences aren't of the
|
||||
# same length. In general this will tested as a side-effect through a variety of other tests -
|
||||
# it'll simply hang trying to synchronize with other gpus if this problem is encountered. So as
|
||||
# long as we have a few full tests running on zero3 + predict_with_generate this should be
|
||||
# mostly covered.
|
||||
#
|
||||
# but there are 5 variations on beam search in `generate`- with identical code branched with `if
|
||||
# synced_gpus`
|
||||
#
|
||||
# 2. most tests should probably be run on both: zero2 and zero3 configs
|
||||
#
|
||||
|
||||
@require_torch_multi_gpu
|
||||
@parameterized.expand(stages)
|
||||
def test_basic_distributed(self, stage):
|
||||
self.run_and_check(stage=stage, distributed=True)
|
||||
|
||||
@parameterized.expand(stages)
|
||||
def test_do_eval_no_train(self, stage):
|
||||
# we should not fail if train is skipped
|
||||
self.run_and_check(
|
||||
stage=stage,
|
||||
eval_steps=1,
|
||||
distributed=False,
|
||||
do_train=False,
|
||||
do_eval=True,
|
||||
)
|
||||
|
||||
@parameterized.expand(stages)
|
||||
def test_resume_train_not_from_ds_checkpoint(self, stage):
|
||||
# do normal training and then resume not from the deepspeed checkpoint but explicitly from
|
||||
# the saved model dir
|
||||
|
||||
do_train = True
|
||||
do_eval = False
|
||||
kwargs = dict(stage=stage, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval)
|
||||
|
||||
# 1. normal training
|
||||
output_dir = self.run_and_check(**kwargs)
|
||||
|
||||
# 2. now resume explicitly from the saved weights, by passing --model_name_or_path output_dir
|
||||
# - i.e. the same path the model was saved to in step 1
|
||||
output_dir = self.run_trainer(**kwargs, model_name=output_dir)
|
||||
|
||||
self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
|
||||
|
||||
def do_checks(self, output_dir, do_train=True, do_eval=True):
|
||||
|
||||
if do_train:
|
||||
train_metrics = load_json(os.path.join(output_dir, "train_results.json"))
|
||||
self.assertIn("train_samples_per_second", train_metrics)
|
||||
self.assertGreater(train_metrics["train_samples_per_second"], 0.5)
|
||||
|
||||
if do_eval:
|
||||
eval_metrics = load_json(os.path.join(output_dir, "eval_results.json"))
|
||||
self.assertIn("eval_bleu", eval_metrics)
|
||||
self.assertGreater(eval_metrics["eval_bleu"], 0)
|
||||
|
||||
# XXX: need to do better validation beyond just that the run was successful
|
||||
def run_and_check(
|
||||
self,
|
||||
stage,
|
||||
eval_steps=10,
|
||||
distributed=True,
|
||||
do_train=True,
|
||||
do_eval=True,
|
||||
extra_args_str=None,
|
||||
remove_args_str=None,
|
||||
):
|
||||
|
||||
# we are doing quality testing so using a small real model
|
||||
output_dir = self.run_trainer(
|
||||
stage=stage,
|
||||
model_name=T5_SMALL,
|
||||
eval_steps=eval_steps,
|
||||
num_train_epochs=1,
|
||||
do_train=do_train,
|
||||
do_eval=do_eval,
|
||||
distributed=distributed,
|
||||
extra_args_str=extra_args_str,
|
||||
remove_args_str=remove_args_str,
|
||||
)
|
||||
|
||||
self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
|
||||
|
||||
return output_dir
|
||||
|
||||
def run_trainer(
|
||||
self,
|
||||
stage: str,
|
||||
model_name: str,
|
||||
eval_steps: int = 10,
|
||||
num_train_epochs: int = 1,
|
||||
do_train: bool = False,
|
||||
do_eval: bool = True,
|
||||
distributed: bool = True,
|
||||
extra_args_str: str = None,
|
||||
remove_args_str: str = None,
|
||||
):
|
||||
max_len = 32
|
||||
data_dir = self.examples_dir / "test_data/wmt_en_ro"
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
args = f"""
|
||||
--model_name_or_path {model_name}
|
||||
--train_file {data_dir}/train.json
|
||||
--validation_file {data_dir}/val.json
|
||||
--output_dir {output_dir}
|
||||
--overwrite_output_dir
|
||||
--max_source_length {max_len}
|
||||
--max_target_length {max_len}
|
||||
--val_max_target_length {max_len}
|
||||
--warmup_steps 8
|
||||
--predict_with_generate
|
||||
--logging_steps 0
|
||||
--save_steps 0
|
||||
--eval_steps {eval_steps}
|
||||
--group_by_length
|
||||
--label_smoothing_factor 0.1
|
||||
--adafactor
|
||||
--source_lang en
|
||||
--target_lang ro
|
||||
""".split()
|
||||
args.extend(["--source_prefix", '"translate English to Romanian: "'])
|
||||
|
||||
actions = 0
|
||||
if do_train:
|
||||
actions += 1
|
||||
args.extend(
|
||||
f"""
|
||||
--do_train
|
||||
--num_train_epochs {str(num_train_epochs)}
|
||||
--max_train_samples 100
|
||||
--per_device_train_batch_size 2
|
||||
--learning_rate 3e-3
|
||||
""".split()
|
||||
)
|
||||
|
||||
if do_eval:
|
||||
actions += 1
|
||||
args.extend(
|
||||
"""
|
||||
--do_eval
|
||||
--max_val_samples 100
|
||||
--per_device_eval_batch_size 2
|
||||
""".split()
|
||||
)
|
||||
|
||||
assert actions > 0, "need at least do_train or do_eval for the test to run"
|
||||
|
||||
if extra_args_str is not None:
|
||||
args.extend(extra_args_str.split())
|
||||
|
||||
# currently only works for bool args
|
||||
if remove_args_str is not None:
|
||||
remove_args = remove_args_str.split()
|
||||
args = [x for x in args if x not in remove_args]
|
||||
|
||||
ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
|
||||
script = [f"{self.examples_dir_str}/seq2seq/run_translation.py"]
|
||||
num_gpus = get_gpu_count() if distributed else 1
|
||||
launcher = f"deepspeed --num_gpus {num_gpus}".split()
|
||||
|
||||
cmd = launcher + script + args + ds_args
|
||||
# keep for quick debug
|
||||
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
|
||||
return output_dir
|
||||
|
||||
@parameterized.expand(stages)
|
||||
def test_clm(self, stage):
|
||||
# this test exercises model.resize_token_embeddings() which requires param gathering outside
|
||||
# of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
|
||||
|
||||
data_dir = self.tests_dir / "fixtures"
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
args = f"""
|
||||
--model_name_or_path sshleifer/tiny-gpt2
|
||||
--train_file {data_dir}/sample_text.txt
|
||||
--validation_file {data_dir}/sample_text.txt
|
||||
--output_dir {output_dir}
|
||||
--overwrite_output_dir
|
||||
--do_train
|
||||
--do_eval
|
||||
--max_train_samples 10
|
||||
--max_val_samples 10
|
||||
--per_device_train_batch_size 5
|
||||
--per_device_eval_batch_size 5
|
||||
--num_train_epochs 1
|
||||
--warmup_steps 8
|
||||
--block_size 128
|
||||
""".split()
|
||||
|
||||
distributed = True
|
||||
ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
|
||||
script = [f"{self.examples_dir_str}/language-modeling/run_clm.py"]
|
||||
num_gpus = get_gpu_count() if distributed else 1
|
||||
launcher = f"deepspeed --num_gpus {num_gpus}".split()
|
||||
|
||||
cmd = launcher + script + args + ds_args
|
||||
# keep for quick debug
|
||||
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
|
||||
return output_dir
|
||||
238
tests/extended/test_trainer_ext.py
Normal file
238
tests/extended/test_trainer_ext.py
Normal file
@@ -0,0 +1,238 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from transformers.file_utils import is_apex_available
|
||||
from transformers.integrations import is_fairscale_available
|
||||
from transformers.testing_utils import (
|
||||
ExtendSysPath,
|
||||
TestCasePlus,
|
||||
execute_subprocess_async,
|
||||
get_gpu_count,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
require_torch_non_multi_gpu,
|
||||
slow,
|
||||
)
|
||||
from transformers.trainer_callback import TrainerState
|
||||
from transformers.trainer_utils import set_seed
|
||||
|
||||
|
||||
bindir = os.path.abspath(os.path.dirname(__file__))
|
||||
with ExtendSysPath(f"{bindir}/../../examples/seq2seq"):
|
||||
from run_translation import main # noqa
|
||||
|
||||
|
||||
set_seed(42)
|
||||
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
|
||||
MBART_TINY = "sshleifer/tiny-mbart"
|
||||
|
||||
|
||||
# a candidate for testing_utils
|
||||
def require_fairscale(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires fairscale
|
||||
"""
|
||||
if not is_fairscale_available():
|
||||
return unittest.skip("test requires fairscale")(test_case)
|
||||
else:
|
||||
return test_case
|
||||
|
||||
|
||||
# a candidate for testing_utils
|
||||
def require_apex(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires apex
|
||||
"""
|
||||
if not is_apex_available():
|
||||
return unittest.skip("test requires apex")(test_case)
|
||||
else:
|
||||
return test_case
|
||||
|
||||
|
||||
class TestTrainerExt(TestCasePlus):
|
||||
def run_seq2seq_quick(self, distributed=False, extra_args_str=None, predict_with_generate=True):
|
||||
output_dir = self.run_trainer(
|
||||
eval_steps=1,
|
||||
max_len=12,
|
||||
model_name=MBART_TINY,
|
||||
num_train_epochs=1,
|
||||
distributed=distributed,
|
||||
extra_args_str=extra_args_str,
|
||||
predict_with_generate=predict_with_generate,
|
||||
)
|
||||
logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
|
||||
eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
|
||||
|
||||
first_step_stats = eval_metrics[0]
|
||||
if predict_with_generate:
|
||||
assert "eval_bleu" in first_step_stats
|
||||
|
||||
last_step_stats = eval_metrics[-1]
|
||||
assert isinstance(last_step_stats["eval_bleu"], float)
|
||||
assert not math.isnan(float(last_step_stats["eval_loss"])), "eval_loss must not be `nan`"
|
||||
|
||||
@require_torch_non_multi_gpu
|
||||
def test_run_seq2seq_no_dist(self):
|
||||
self.run_seq2seq_quick()
|
||||
|
||||
# verify that the trainer can handle non-distributed with n_gpu > 1
|
||||
@require_torch_multi_gpu
|
||||
def test_run_seq2seq_dp(self):
|
||||
self.run_seq2seq_quick(distributed=False)
|
||||
|
||||
# verify that the trainer can handle distributed with n_gpu > 1
|
||||
@require_torch_multi_gpu
|
||||
def test_run_seq2seq_ddp(self):
|
||||
self.run_seq2seq_quick(distributed=True)
|
||||
|
||||
# test --sharded_ddp w/o --fp16
|
||||
@require_torch_multi_gpu
|
||||
@require_fairscale
|
||||
def test_run_seq2seq_sharded_ddp(self):
|
||||
self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple")
|
||||
|
||||
# test --sharded_ddp w/ --fp16
|
||||
@require_torch_multi_gpu
|
||||
@require_fairscale
|
||||
def test_run_seq2seq_sharded_ddp_fp16(self):
|
||||
self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple --fp16")
|
||||
|
||||
# test --sharded_ddp zero_dp_2 w/o --fp16
|
||||
@require_torch_multi_gpu
|
||||
@require_fairscale
|
||||
def test_run_seq2seq_fully_sharded_ddp(self):
|
||||
self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False)
|
||||
|
||||
# test --sharded_ddp zero_dp_2 w/ --fp16
|
||||
@require_torch_multi_gpu
|
||||
@require_fairscale
|
||||
def test_run_seq2seq_fully_sharded_ddp_fp16(self):
|
||||
self.run_seq2seq_quick(
|
||||
distributed=True, extra_args_str="--sharded_ddp zero_dp_2 --fp16", predict_with_generate=False
|
||||
)
|
||||
|
||||
@require_apex
|
||||
@require_torch_gpu
|
||||
def test_run_seq2seq_apex(self):
|
||||
# XXX: apex breaks the trainer if it's run twice e.g. run_seq2seq.main() from the same
|
||||
# program and it breaks other tests that run from the same pytest worker, therefore until this is
|
||||
# sorted out it must be run only in an external program, that is distributed=True in this
|
||||
# test and only under one or more gpus - if we want cpu will need to make a special test
|
||||
#
|
||||
# specifically to the problem traced it to self.optimizer.step() - if it's run 2nd time via
|
||||
# 2nd main() call it botches the future eval.
|
||||
#
|
||||
self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
|
||||
# test 2nd time - was getting eval_loss': nan'
|
||||
# to reproduce the problem set distributed=False
|
||||
self.run_seq2seq_quick(distributed=True, extra_args_str="--fp16 --fp16_backend=apex")
|
||||
|
||||
@slow
|
||||
def test_run_seq2seq_slow(self):
|
||||
output_dir = self.run_trainer(
|
||||
eval_steps=2,
|
||||
max_len=128,
|
||||
model_name=MARIAN_MODEL,
|
||||
learning_rate=3e-4,
|
||||
num_train_epochs=10,
|
||||
distributed=False,
|
||||
)
|
||||
|
||||
# Check metrics
|
||||
logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
|
||||
eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
|
||||
first_step_stats = eval_metrics[0]
|
||||
last_step_stats = eval_metrics[-1]
|
||||
|
||||
assert first_step_stats["eval_loss"] > last_step_stats["eval_loss"], "model learned nothing"
|
||||
assert isinstance(last_step_stats["eval_bleu"], float)
|
||||
|
||||
# test if do_predict saves generations and metrics
|
||||
contents = os.listdir(output_dir)
|
||||
contents = {os.path.basename(p) for p in contents}
|
||||
assert "test_generations.txt" in contents
|
||||
assert "test_results.json" in contents
|
||||
|
||||
def run_trainer(
|
||||
self,
|
||||
eval_steps: int,
|
||||
max_len: int,
|
||||
model_name: str,
|
||||
num_train_epochs: int,
|
||||
learning_rate: float = 3e-3,
|
||||
distributed: bool = False,
|
||||
extra_args_str: str = None,
|
||||
predict_with_generate: bool = True,
|
||||
):
|
||||
data_dir = self.examples_dir / "test_data/wmt_en_ro"
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
args = f"""
|
||||
--model_name_or_path {model_name}
|
||||
--train_file {data_dir}/train.json
|
||||
--validation_file {data_dir}/val.json
|
||||
--test_file {data_dir}/test.json
|
||||
--output_dir {output_dir}
|
||||
--overwrite_output_dir
|
||||
--max_train_samples 8
|
||||
--max_val_samples 8
|
||||
--max_source_length {max_len}
|
||||
--max_target_length {max_len}
|
||||
--val_max_target_length {max_len}
|
||||
--do_train
|
||||
--do_eval
|
||||
--do_predict
|
||||
--num_train_epochs {str(num_train_epochs)}
|
||||
--per_device_train_batch_size 4
|
||||
--per_device_eval_batch_size 4
|
||||
--learning_rate {learning_rate}
|
||||
--warmup_steps 8
|
||||
--evaluation_strategy steps
|
||||
--logging_steps 0
|
||||
--eval_steps {str(eval_steps)}
|
||||
--save_steps {str(eval_steps)}
|
||||
--group_by_length
|
||||
--label_smoothing_factor 0.1
|
||||
--adafactor
|
||||
--target_lang ro_RO
|
||||
--source_lang en_XX
|
||||
"""
|
||||
if predict_with_generate:
|
||||
args += "--predict_with_generate"
|
||||
|
||||
args = args.split()
|
||||
|
||||
if extra_args_str is not None:
|
||||
args.extend(extra_args_str.split())
|
||||
|
||||
if distributed:
|
||||
n_gpu = get_gpu_count()
|
||||
distributed_args = f"""
|
||||
-m torch.distributed.launch
|
||||
--nproc_per_node={n_gpu}
|
||||
{self.examples_dir_str}/seq2seq/run_translation.py
|
||||
""".split()
|
||||
cmd = [sys.executable] + distributed_args + args
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
else:
|
||||
testargs = ["run_translation.py"] + args
|
||||
with patch.object(sys, "argv", testargs):
|
||||
main()
|
||||
|
||||
return output_dir
|
||||
@@ -136,10 +136,7 @@ images:
|
||||
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /,
|
||||
*CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
|
||||
```
|
||||
2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests).
|
||||
|
||||
TODO: Add a screenshot of PR + Text template to make it easy to open.
|
||||
|
||||
2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). You can take a look at this [PR](https://github.com/aws/deep-learning-containers/pull/1016), which information are needed.
|
||||
|
||||
## Current Tests
|
||||
|
||||
@@ -150,4 +147,4 @@ TODO: Add a screenshot of PR + Text template to make it easy to open.
|
||||
| pytorch-transfromers-test-2-smd | test bert finetuning using BERT from transformer lib+ PT SM DDP | SageMaker createTrainingJob | 16 | train_runtime, eval_accuracy & eval_loss |
|
||||
| pytorch-transfromers-test-1-smp | test roberta finetuning using BERT from transformer lib+ PT SM MP | SageMaker createTrainingJob | 8 | train_runtime, eval_accuracy & eval_loss |
|
||||
| tensorflow-transfromers-test-single | Test bert finetuning using BERT from transformer lib+TF | SageMaker createTrainingJob | 1 | train_runtime, eval_accuracy & eval_loss |
|
||||
| tensorflow-transfromers-test-2-smd | test bert finetuning using BERT from transformer lib+ TF SM DDP | SageMaker createTrainingJob | 16 | train_runtime, eval_accuracy & eval_loss |
|
||||
| tensorflow-transfromers-test-2-smd | test bert finetuning using BERT from transformer lib+ TF SM DDP | SageMaker createTrainingJob | 16 | train_runtime, eval_accuracy & eval_loss |
|
||||
|
||||
@@ -146,11 +146,8 @@ class DataCollatorIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(batch["labels"].shape, torch.Size([2, 6]))
|
||||
self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3)
|
||||
|
||||
def test_data_collator_for_language_modeling(self):
|
||||
def _test_no_pad_and_pad(self, no_pad_features, pad_features):
|
||||
tokenizer = BertTokenizer(self.vocab_file)
|
||||
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
|
||||
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
|
||||
batch = data_collator(no_pad_features)
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
|
||||
@@ -160,6 +157,15 @@ class DataCollatorIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)
|
||||
batch = data_collator(no_pad_features)
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
|
||||
|
||||
batch = data_collator(pad_features)
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
|
||||
|
||||
tokenizer._pad_token = None
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
|
||||
with self.assertRaises(ValueError):
|
||||
@@ -185,6 +191,32 @@ class DataCollatorIntegrationTest(unittest.TestCase):
|
||||
self.assertTrue(torch.any(masked_tokens))
|
||||
self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
|
||||
batch = data_collator(no_pad_features)
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
|
||||
|
||||
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
|
||||
self.assertTrue(torch.any(masked_tokens))
|
||||
self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
|
||||
|
||||
batch = data_collator(pad_features)
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
|
||||
|
||||
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
|
||||
self.assertTrue(torch.any(masked_tokens))
|
||||
self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
|
||||
|
||||
def test_data_collator_for_language_modeling(self):
|
||||
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
|
||||
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
|
||||
self._test_no_pad_and_pad(no_pad_features, pad_features)
|
||||
|
||||
no_pad_features = [list(range(10)), list(range(10))]
|
||||
pad_features = [list(range(5)), list(range(10))]
|
||||
self._test_no_pad_and_pad(no_pad_features, pad_features)
|
||||
|
||||
def test_plm(self):
|
||||
tokenizer = BertTokenizer(self.vocab_file)
|
||||
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
|
||||
@@ -225,6 +257,14 @@ class DataCollatorIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
|
||||
self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
|
||||
batch = data_collator(features)
|
||||
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))
|
||||
|
||||
def test_sop(self):
|
||||
tokenizer = BertTokenizer(self.vocab_file)
|
||||
features = [
|
||||
@@ -242,3 +282,11 @@ class DataCollatorIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
|
||||
self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
|
||||
batch = data_collator(features)
|
||||
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -234,7 +235,7 @@ class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
)
|
||||
|
||||
@@ -13,7 +13,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import copy
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
@@ -46,6 +47,8 @@ if is_torch_available():
|
||||
BertForSequenceClassification,
|
||||
BertForTokenClassification,
|
||||
BertModel,
|
||||
FunnelBaseModel,
|
||||
FunnelModel,
|
||||
GPT2Config,
|
||||
GPT2LMHeadModel,
|
||||
RobertaForMaskedLM,
|
||||
@@ -218,6 +221,21 @@ class AutoModelTest(unittest.TestCase):
|
||||
self.assertEqual(model.num_parameters(), 14410)
|
||||
self.assertEqual(model.num_parameters(only_trainable=True), 14410)
|
||||
|
||||
def test_from_pretrained_with_tuple_values(self):
|
||||
# For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel
|
||||
model = AutoModel.from_pretrained("sgugger/funnel-random-tiny")
|
||||
self.assertIsInstance(model, FunnelModel)
|
||||
|
||||
config = copy.deepcopy(model.config)
|
||||
config.architectures = ["FunnelBaseModel"]
|
||||
model = AutoModel.from_config(config)
|
||||
self.assertIsInstance(model, FunnelBaseModel)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
model.save_pretrained(tmp_dir)
|
||||
model = AutoModel.from_pretrained(tmp_dir)
|
||||
self.assertIsInstance(model, FunnelBaseModel)
|
||||
|
||||
def test_parents_and_children_in_mappings(self):
|
||||
# Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
|
||||
# by the parents and will return the wrong configuration type when using auto models
|
||||
@@ -242,6 +260,12 @@ class AutoModelTest(unittest.TestCase):
|
||||
assert not issubclass(
|
||||
child_config, parent_config
|
||||
), f"{child_config.__name__} is child of {parent_config.__name__}"
|
||||
assert not issubclass(
|
||||
child_model, parent_model
|
||||
), f"{child_config.__name__} is child of {parent_config.__name__}"
|
||||
|
||||
# Tuplify child_model and parent_model since some of them could be tuples.
|
||||
if not isinstance(child_model, (list, tuple)):
|
||||
child_model = (child_model,)
|
||||
if not isinstance(parent_model, (list, tuple)):
|
||||
parent_model = (parent_model,)
|
||||
|
||||
for child, parent in [(a, b) for a in child_model for b in parent_model]:
|
||||
assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}"
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -444,7 +445,7 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
)
|
||||
|
||||
@@ -19,6 +19,7 @@ import unittest
|
||||
|
||||
from tests.test_modeling_common import floats_tensor
|
||||
from transformers import is_torch_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer
|
||||
from transformers.testing_utils import require_torch, slow, torch_device
|
||||
|
||||
@@ -458,7 +459,7 @@ class BigBirdModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
)
|
||||
|
||||
@@ -24,6 +24,7 @@ from typing import List, Tuple
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.file_utils import WEIGHTS_NAME
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_torch, require_torch_multi_gpu, slow, torch_device
|
||||
|
||||
|
||||
@@ -79,7 +80,7 @@ class ModelTesterMixin:
|
||||
|
||||
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
||||
inputs_dict = copy.deepcopy(inputs_dict)
|
||||
if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
|
||||
inputs_dict = {
|
||||
k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
|
||||
if isinstance(v, torch.Tensor) and v.ndim > 1
|
||||
@@ -88,9 +89,9 @@ class ModelTesterMixin:
|
||||
}
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
|
||||
inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
|
||||
elif model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
|
||||
elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
|
||||
inputs_dict["start_positions"] = torch.zeros(
|
||||
self.model_tester.batch_size, dtype=torch.long, device=torch_device
|
||||
)
|
||||
@@ -98,18 +99,18 @@ class ModelTesterMixin:
|
||||
self.model_tester.batch_size, dtype=torch.long, device=torch_device
|
||||
)
|
||||
elif model_class in [
|
||||
*MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(),
|
||||
*MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(),
|
||||
*MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.values(),
|
||||
*get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
|
||||
*get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
|
||||
*get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
|
||||
]:
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
self.model_tester.batch_size, dtype=torch.long, device=torch_device
|
||||
)
|
||||
elif model_class in [
|
||||
*MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(),
|
||||
*MODEL_FOR_CAUSAL_LM_MAPPING.values(),
|
||||
*MODEL_FOR_MASKED_LM_MAPPING.values(),
|
||||
*MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(),
|
||||
*get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
|
||||
*get_values(MODEL_FOR_CAUSAL_LM_MAPPING),
|
||||
*get_values(MODEL_FOR_MASKED_LM_MAPPING),
|
||||
*get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
|
||||
]:
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
@@ -229,7 +230,7 @@ class ModelTesterMixin:
|
||||
config.return_dict = True
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
if model_class in MODEL_MAPPING.values():
|
||||
if model_class in get_values(MODEL_MAPPING):
|
||||
continue
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
@@ -248,7 +249,7 @@ class ModelTesterMixin:
|
||||
config.return_dict = True
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
if model_class in MODEL_MAPPING.values():
|
||||
if model_class in get_values(MODEL_MAPPING):
|
||||
continue
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
@@ -312,7 +313,7 @@ class ModelTesterMixin:
|
||||
if "labels" in inputs_dict:
|
||||
correct_outlen += 1 # loss is added to beginning
|
||||
# Question Answering model returns start_logits and end_logits
|
||||
if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
|
||||
correct_outlen += 1 # start_logits and end_logits instead of only 1 output
|
||||
if "past_key_values" in outputs:
|
||||
correct_outlen += 1 # past_key_values have been returned
|
||||
|
||||
@@ -19,6 +19,7 @@ import unittest
|
||||
|
||||
from tests.test_modeling_common import floats_tensor
|
||||
from transformers import is_torch_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -352,7 +353,7 @@ class ConvBertModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
if "labels" in inputs_dict:
|
||||
correct_outlen += 1 # loss is added to beginning
|
||||
# Question Answering model returns start_logits and end_logits
|
||||
if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
|
||||
correct_outlen += 1 # start_logits and end_logits instead of only 1 output
|
||||
if "past_key_values" in outputs:
|
||||
correct_outlen += 1 # past_key_values have been returned
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -292,7 +293,7 @@ class ElectraModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
)
|
||||
|
||||
@@ -29,6 +29,7 @@ if is_flax_available():
|
||||
FlaxBertForNextSentencePrediction,
|
||||
FlaxBertForPreTraining,
|
||||
FlaxBertForQuestionAnswering,
|
||||
FlaxBertForSequenceClassification,
|
||||
FlaxBertForTokenClassification,
|
||||
FlaxBertModel,
|
||||
)
|
||||
@@ -125,6 +126,7 @@ class FlaxBertModelTest(FlaxModelTesterMixin, unittest.TestCase):
|
||||
FlaxBertForMultipleChoice,
|
||||
FlaxBertForQuestionAnswering,
|
||||
FlaxBertForNextSentencePrediction,
|
||||
FlaxBertForSequenceClassification,
|
||||
FlaxBertForTokenClassification,
|
||||
FlaxBertForQuestionAnswering,
|
||||
)
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
import unittest
|
||||
|
||||
from transformers import FunnelTokenizer, is_torch_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -365,7 +366,7 @@ class FunnelModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
)
|
||||
|
||||
@@ -21,6 +21,7 @@ import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.file_utils import cached_property
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -412,7 +413,7 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
if "labels" in inputs_dict:
|
||||
correct_outlen += 1 # loss is added to beginning
|
||||
# Question Answering model returns start_logits and end_logits
|
||||
if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
|
||||
correct_outlen += 1 # start_logits and end_logits instead of only 1 output
|
||||
if "past_key_values" in outputs:
|
||||
correct_outlen += 1 # past_key_values have been returned
|
||||
|
||||
@@ -18,6 +18,7 @@ import copy
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -532,11 +533,11 @@ class LxmertModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
inputs_dict = copy.deepcopy(inputs_dict)
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
self.model_tester.batch_size, dtype=torch.long, device=torch_device
|
||||
)
|
||||
elif model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
elif model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
|
||||
# special case for models like BERT that use multi-loss training for PreTraining
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
|
||||
@@ -21,6 +21,7 @@ import os
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -291,7 +292,7 @@ class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
)
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -272,7 +273,7 @@ class MobileBertModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
)
|
||||
|
||||
@@ -32,6 +32,7 @@ from transformers import (
|
||||
is_torch_available,
|
||||
)
|
||||
from transformers.file_utils import cached_property
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_scatter, require_torch, slow, torch_device
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -425,7 +426,7 @@ class TapasModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
|
||||
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
|
||||
inputs_dict = copy.deepcopy(inputs_dict)
|
||||
if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
|
||||
inputs_dict = {
|
||||
k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
|
||||
if isinstance(v, torch.Tensor) and v.ndim > 1
|
||||
@@ -434,9 +435,9 @@ class TapasModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
}
|
||||
|
||||
if return_labels:
|
||||
if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
|
||||
if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
|
||||
inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
|
||||
elif model_class in MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING.values():
|
||||
elif model_class in get_values(MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING):
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
)
|
||||
@@ -457,17 +458,17 @@ class TapasModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
self.model_tester.batch_size, dtype=torch.float, device=torch_device
|
||||
)
|
||||
elif model_class in [
|
||||
*MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(),
|
||||
*MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(),
|
||||
*get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
|
||||
*get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
|
||||
]:
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
self.model_tester.batch_size, dtype=torch.long, device=torch_device
|
||||
)
|
||||
elif model_class in [
|
||||
*MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(),
|
||||
*MODEL_FOR_CAUSAL_LM_MAPPING.values(),
|
||||
*MODEL_FOR_MASKED_LM_MAPPING.values(),
|
||||
*MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(),
|
||||
*get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
|
||||
*get_values(MODEL_FOR_CAUSAL_LM_MAPPING),
|
||||
*get_values(MODEL_FOR_MASKED_LM_MAPPING),
|
||||
*get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
|
||||
]:
|
||||
inputs_dict["labels"] = torch.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
import unittest
|
||||
|
||||
from transformers import AlbertConfig, is_tf_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_tf, slow
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -249,7 +250,7 @@ class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
|
||||
if return_labels:
|
||||
if model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING):
|
||||
inputs_dict["sentence_order_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
|
||||
|
||||
return inputs_dict
|
||||
|
||||
@@ -13,7 +13,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import copy
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import is_tf_available
|
||||
@@ -39,6 +40,8 @@ if is_tf_available():
|
||||
TFBertForQuestionAnswering,
|
||||
TFBertForSequenceClassification,
|
||||
TFBertModel,
|
||||
TFFunnelBaseModel,
|
||||
TFFunnelModel,
|
||||
TFGPT2LMHeadModel,
|
||||
TFRobertaForMaskedLM,
|
||||
TFT5ForConditionalGeneration,
|
||||
@@ -176,6 +179,21 @@ class TFAutoModelTest(unittest.TestCase):
|
||||
self.assertEqual(model.num_parameters(), 14410)
|
||||
self.assertEqual(model.num_parameters(only_trainable=True), 14410)
|
||||
|
||||
def test_from_pretrained_with_tuple_values(self):
|
||||
# For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel
|
||||
model = TFAutoModel.from_pretrained("sgugger/funnel-random-tiny")
|
||||
self.assertIsInstance(model, TFFunnelModel)
|
||||
|
||||
config = copy.deepcopy(model.config)
|
||||
config.architectures = ["FunnelBaseModel"]
|
||||
model = TFAutoModel.from_config(config)
|
||||
self.assertIsInstance(model, TFFunnelBaseModel)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
model.save_pretrained(tmp_dir)
|
||||
model = TFAutoModel.from_pretrained(tmp_dir)
|
||||
self.assertIsInstance(model, TFFunnelBaseModel)
|
||||
|
||||
def test_parents_and_children_in_mappings(self):
|
||||
# Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
|
||||
# by the parents and will return the wrong configuration type when using auto models
|
||||
@@ -197,4 +215,12 @@ class TFAutoModelTest(unittest.TestCase):
|
||||
for parent_config, parent_model in mapping[: index + 1]:
|
||||
with self.subTest(msg=f"Testing if {child_config.__name__} is child of {parent_config.__name__}"):
|
||||
self.assertFalse(issubclass(child_config, parent_config))
|
||||
self.assertFalse(issubclass(child_model, parent_model))
|
||||
|
||||
# Tuplify child_model and parent_model since some of them could be tuples.
|
||||
if not isinstance(child_model, (list, tuple)):
|
||||
child_model = (child_model,)
|
||||
if not isinstance(parent_model, (list, tuple)):
|
||||
parent_model = (parent_model,)
|
||||
|
||||
for child, parent in [(a, b) for a in child_model for b in parent_model]:
|
||||
assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}"
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
import unittest
|
||||
|
||||
from transformers import BertConfig, is_tf_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_tf, slow
|
||||
|
||||
from .test_configuration_common import ConfigTester
|
||||
@@ -282,7 +283,7 @@ class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
|
||||
|
||||
if return_labels:
|
||||
if model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.values():
|
||||
if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING):
|
||||
inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
|
||||
|
||||
return inputs_dict
|
||||
|
||||
@@ -25,6 +25,7 @@ from importlib import import_module
|
||||
from typing import List, Tuple
|
||||
|
||||
from transformers import is_tf_available
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import (
|
||||
_tf_gpu_memory_limit,
|
||||
is_pt_tf_cross_test,
|
||||
@@ -89,7 +90,7 @@ class TFModelTesterMixin:
|
||||
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
|
||||
inputs_dict = copy.deepcopy(inputs_dict)
|
||||
|
||||
if model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
|
||||
if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
|
||||
inputs_dict = {
|
||||
k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
|
||||
if isinstance(v, tf.Tensor) and v.ndim > 0
|
||||
@@ -98,21 +99,21 @@ class TFModelTesterMixin:
|
||||
}
|
||||
|
||||
if return_labels:
|
||||
if model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
|
||||
if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
|
||||
inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
|
||||
elif model_class in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
|
||||
elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
|
||||
inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
|
||||
inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
|
||||
elif model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values():
|
||||
elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
|
||||
inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
|
||||
elif model_class in TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values():
|
||||
elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
|
||||
inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
|
||||
elif model_class in [
|
||||
*TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(),
|
||||
*TF_MODEL_FOR_CAUSAL_LM_MAPPING.values(),
|
||||
*TF_MODEL_FOR_MASKED_LM_MAPPING.values(),
|
||||
*TF_MODEL_FOR_PRETRAINING_MAPPING.values(),
|
||||
*TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(),
|
||||
*get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
|
||||
*get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
|
||||
*get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
|
||||
*get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
|
||||
*get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
|
||||
]:
|
||||
inputs_dict["labels"] = tf.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
|
||||
@@ -580,7 +581,7 @@ class TFModelTesterMixin:
|
||||
),
|
||||
"input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
|
||||
}
|
||||
elif model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
|
||||
elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
|
||||
input_ids = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
|
||||
else:
|
||||
input_ids = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
|
||||
@@ -796,9 +797,9 @@ class TFModelTesterMixin:
|
||||
def test_model_common_attributes(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
list_lm_models = (
|
||||
list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.values())
|
||||
+ list(TF_MODEL_FOR_MASKED_LM_MAPPING.values())
|
||||
+ list(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values())
|
||||
get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
|
||||
+ get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
|
||||
+ get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING)
|
||||
)
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
@@ -1128,7 +1129,7 @@ class TFModelTesterMixin:
|
||||
]
|
||||
loss_size = tf.size(added_label)
|
||||
|
||||
if model.__class__ in TF_MODEL_FOR_CAUSAL_LM_MAPPING.values():
|
||||
if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING):
|
||||
# if loss is causal lm loss, labels are shift, so that one label per batch
|
||||
# is cut
|
||||
loss_size = loss_size - self.model_tester.batch_size
|
||||
|
||||
Reference in New Issue
Block a user