Merge pull request #604 from samuelbroscheit/master
Fixing issue "Training beyond specified 't_total' steps with schedule 'warmup_linear'" reported in #556
This commit is contained in:
@@ -25,6 +25,7 @@ import random
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import math
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
@@ -735,15 +736,6 @@ def main():
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
train_examples = None
|
||||
num_train_optimization_steps = None
|
||||
if args.do_train:
|
||||
train_examples = processor.get_train_examples(args.data_dir)
|
||||
num_train_optimization_steps = int(
|
||||
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
|
||||
if args.local_rank != -1:
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare model
|
||||
cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
|
||||
model = BertForSequenceClassification.from_pretrained(args.bert_model,
|
||||
@@ -762,8 +754,35 @@ def main():
|
||||
elif n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Prepare optimizer
|
||||
if args.do_train:
|
||||
|
||||
# Prepare data loader
|
||||
|
||||
train_examples = processor.get_train_examples(args.data_dir)
|
||||
train_features = convert_examples_to_features(
|
||||
train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
|
||||
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
|
||||
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
|
||||
|
||||
if output_mode == "classification":
|
||||
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
|
||||
elif output_mode == "regression":
|
||||
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
|
||||
|
||||
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
|
||||
if args.local_rank == -1:
|
||||
train_sampler = RandomSampler(train_data)
|
||||
else:
|
||||
train_sampler = DistributedSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
if args.local_rank != -1:
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare optimizer
|
||||
|
||||
param_optimizer = list(model.named_parameters())
|
||||
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
@@ -797,28 +816,11 @@ def main():
|
||||
global_step = 0
|
||||
nb_tr_steps = 0
|
||||
tr_loss = 0
|
||||
if args.do_train:
|
||||
train_features = convert_examples_to_features(
|
||||
train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
|
||||
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_examples))
|
||||
logger.info(" Batch size = %d", args.train_batch_size)
|
||||
logger.info(" Num steps = %d", num_train_optimization_steps)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
|
||||
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
|
||||
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
|
||||
|
||||
if output_mode == "classification":
|
||||
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
|
||||
elif output_mode == "regression":
|
||||
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
|
||||
|
||||
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
|
||||
if args.local_rank == -1:
|
||||
train_sampler = RandomSampler(train_data)
|
||||
else:
|
||||
train_sampler = DistributedSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
model.train()
|
||||
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
|
||||
|
||||
@@ -190,7 +190,7 @@ def main():
|
||||
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
|
||||
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
|
||||
]
|
||||
num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
|
||||
num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
|
||||
optimizer = OpenAIAdam(optimizer_grouped_parameters,
|
||||
lr=args.learning_rate,
|
||||
warmup=args.warmup_proportion,
|
||||
|
||||
@@ -894,16 +894,6 @@ def main():
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
train_examples = None
|
||||
num_train_optimization_steps = None
|
||||
if args.do_train:
|
||||
train_examples = read_squad_examples(
|
||||
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
|
||||
num_train_optimization_steps = int(
|
||||
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
|
||||
if args.local_rank != -1:
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare model
|
||||
model = BertForQuestionAnswering.from_pretrained(args.bert_model,
|
||||
cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
|
||||
@@ -921,8 +911,47 @@ def main():
|
||||
elif n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Prepare optimizer
|
||||
if args.do_train:
|
||||
|
||||
# Prepare data loader
|
||||
|
||||
train_examples = read_squad_examples(
|
||||
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
|
||||
cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
|
||||
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
|
||||
try:
|
||||
with open(cached_train_features_file, "rb") as reader:
|
||||
train_features = pickle.load(reader)
|
||||
except:
|
||||
train_features = convert_examples_to_features(
|
||||
examples=train_examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=args.max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=args.max_query_length,
|
||||
is_training=True)
|
||||
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
|
||||
logger.info(" Saving train features into cached file %s", cached_train_features_file)
|
||||
with open(cached_train_features_file, "wb") as writer:
|
||||
pickle.dump(train_features, writer)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
|
||||
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
|
||||
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
|
||||
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
|
||||
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
|
||||
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
|
||||
all_start_positions, all_end_positions)
|
||||
if args.local_rank == -1:
|
||||
train_sampler = RandomSampler(train_data)
|
||||
else:
|
||||
train_sampler = DistributedSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
if args.local_rank != -1:
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare optimizer
|
||||
|
||||
param_optimizer = list(model.named_parameters())
|
||||
|
||||
# hack to remove pooler, which is not used
|
||||
@@ -959,42 +988,12 @@ def main():
|
||||
t_total=num_train_optimization_steps)
|
||||
|
||||
global_step = 0
|
||||
if args.do_train:
|
||||
cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
|
||||
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
|
||||
train_features = None
|
||||
try:
|
||||
with open(cached_train_features_file, "rb") as reader:
|
||||
train_features = pickle.load(reader)
|
||||
except:
|
||||
train_features = convert_examples_to_features(
|
||||
examples=train_examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=args.max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=args.max_query_length,
|
||||
is_training=True)
|
||||
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
|
||||
logger.info(" Saving train features into cached file %s", cached_train_features_file)
|
||||
with open(cached_train_features_file, "wb") as writer:
|
||||
pickle.dump(train_features, writer)
|
||||
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num orig examples = %d", len(train_examples))
|
||||
logger.info(" Num split examples = %d", len(train_features))
|
||||
logger.info(" Batch size = %d", args.train_batch_size)
|
||||
logger.info(" Num steps = %d", num_train_optimization_steps)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
|
||||
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
|
||||
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
|
||||
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
|
||||
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
|
||||
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
|
||||
all_start_positions, all_end_positions)
|
||||
if args.local_rank == -1:
|
||||
train_sampler = RandomSampler(train_data)
|
||||
else:
|
||||
train_sampler = DistributedSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
model.train()
|
||||
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
|
||||
|
||||
@@ -358,15 +358,6 @@ def main():
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
train_examples = None
|
||||
num_train_optimization_steps = None
|
||||
if args.do_train:
|
||||
train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
|
||||
num_train_optimization_steps = int(
|
||||
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
|
||||
if args.local_rank != -1:
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare model
|
||||
model = BertForMultipleChoice.from_pretrained(args.bert_model,
|
||||
cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)),
|
||||
@@ -384,8 +375,30 @@ def main():
|
||||
elif n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Prepare optimizer
|
||||
if args.do_train:
|
||||
|
||||
# Prepare data loader
|
||||
|
||||
train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
|
||||
train_features = convert_examples_to_features(
|
||||
train_examples, tokenizer, args.max_seq_length, True)
|
||||
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
|
||||
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
|
||||
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
|
||||
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
|
||||
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
|
||||
if args.local_rank == -1:
|
||||
train_sampler = RandomSampler(train_data)
|
||||
else:
|
||||
train_sampler = DistributedSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
if args.local_rank != -1:
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare optimizer
|
||||
|
||||
param_optimizer = list(model.named_parameters())
|
||||
|
||||
# hack to remove pooler, which is not used
|
||||
@@ -421,23 +434,11 @@ def main():
|
||||
t_total=num_train_optimization_steps)
|
||||
|
||||
global_step = 0
|
||||
if args.do_train:
|
||||
train_features = convert_examples_to_features(
|
||||
train_examples, tokenizer, args.max_seq_length, True)
|
||||
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_examples))
|
||||
logger.info(" Batch size = %d", args.train_batch_size)
|
||||
logger.info(" Num steps = %d", num_train_optimization_steps)
|
||||
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
|
||||
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
|
||||
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
|
||||
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
|
||||
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
|
||||
if args.local_rank == -1:
|
||||
train_sampler = RandomSampler(train_data)
|
||||
else:
|
||||
train_sampler = DistributedSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
model.train()
|
||||
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
|
||||
|
||||
Reference in New Issue
Block a user