gpt2 and t5 parallel modeling (#8696)
* gpt2 and t5 parallel modeling
* model_parallel utils update
* adding missing model_parallel_utils
  Adds missing model_parallel_utils and reverses the changes to code in modeling_gpt2 and modeling_t5
* training_args reformat
  Reformatted training_args
* style formatting
  Style formatting doc string length on training_args and model_parallel_utils
* style changes
  make style && make quality for training_args and model_parallel_utils.
* adding tests
* minor change in trainer
  reverts loss calculation
* Update training_args.py
* Update training_args.py
  added back docstring language for adam_beta1 and adam_beta2
* Update trainer.py
* Update src/transformers/trainer.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Fix style & rebase

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
@@ -241,7 +241,11 @@ class Trainer:
|
||||
self.hp_name = None
|
||||
if model is None and model_init is not None:
|
||||
model = self.call_model_init()
|
||||
self.model = model.to(args.device) if model is not None else None
|
||||
# Model parallel
|
||||
if not self.args.model_parallel:
|
||||
self.model = model.to(args.device) if model is not None else None
|
||||
else:
|
||||
self.model = model if model is not None else None
|
||||
default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)
|
||||
self.data_collator = data_collator if data_collator is not None else default_collator
|
||||
self.train_dataset = train_dataset
|
||||
@@ -578,7 +582,8 @@ class Trainer:
|
||||
|
||||
model = self.call_model_init(trial)
|
||||
|
||||
self.model = model.to(self.args.device)
|
||||
if not self.args.model_parallel:
|
||||
self.model = model.to(self.args.device)
|
||||
|
||||
# Reinitializes optimizer and scheduler
|
||||
self.optimizer, self.lr_scheduler = None, None
|
||||
@@ -625,7 +630,7 @@ class Trainer:
|
||||
model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)
|
||||
|
||||
# Multi-gpu training (should be after apex fp16 initialization)
|
||||
if self.args.n_gpu > 1:
|
||||
if self.args.n_gpu > 1 and not self.args.model_parallel:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
@@ -805,7 +810,8 @@ class Trainer:
|
||||
)
|
||||
if isinstance(model, PreTrainedModel):
|
||||
self.model = model.from_pretrained(self.state.best_model_checkpoint)
|
||||
self.model = self.model.to(self.args.device)
|
||||
if not self.args.model_parallel:
|
||||
self.model = self.model.to(self.args.device)
|
||||
else:
|
||||
state_dict = torch.load(os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME))
|
||||
self.model.load_state_dict(state_dict)
|
||||
@@ -1323,7 +1329,7 @@ class Trainer:
|
||||
|
||||
model = self.model
|
||||
# multi-gpu eval
|
||||
if self.args.n_gpu > 1:
|
||||
if self.args.n_gpu > 1 and not self.args.model_parallel:
|
||||
model = torch.nn.DataParallel(model)
|
||||
# Note: in torch.distributed mode, there's no point in wrapping the model
|
||||
# inside a DistributedDataParallel as we'll be under `no_grad` anyways.
|
||||
|
||||
Reference in New Issue
Block a user