Black 20 release

This commit is contained in:
Lysandre
2020-08-26 17:20:22 +02:00
parent e78c110338
commit a75c64d80c
191 changed files with 4807 additions and 3503 deletions

View File

@@ -173,7 +173,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
model,
device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True,
)
# Train!
@@ -217,7 +220,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
epochs_trained,
int(args.num_train_epochs),
desc="Epoch",
disable=args.local_rank not in [-1, 0],
)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
@@ -280,11 +286,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
attention_mask=inputs["attention_mask"],
)
loss_logits = F.kl_div(
input=F.log_softmax(logits_stu / args.temperature, dim=-1),
target=F.softmax(logits_tea / args.temperature, dim=-1),
reduction="batchmean",
) * (args.temperature ** 2)
loss_logits = (
F.kl_div(
input=F.log_softmax(logits_stu / args.temperature, dim=-1),
target=F.softmax(logits_tea / args.temperature, dim=-1),
reduction="batchmean",
)
* (args.temperature ** 2)
)
loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
@@ -529,7 +538,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
)
features = convert_examples_to_features(
examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode,
examples,
tokenizer,
max_length=args.max_seq_length,
label_list=label_list,
output_mode=output_mode,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
@@ -592,7 +605,10 @@ def main():
)
# Other parameters
parser.add_argument(
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
"--config_name",
default="",
type=str,
help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--tokenizer_name",
@@ -616,17 +632,27 @@ def main():
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
"--evaluate_during_training",
action="store_true",
help="Run evaluation during training at each logging step.",
)
parser.add_argument(
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
"--do_lower_case",
action="store_true",
help="Set this flag if you are using an uncased model.",
)
parser.add_argument(
"--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
"--per_gpu_train_batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.",
)
parser.add_argument(
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
"--per_gpu_eval_batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for evaluation.",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
@@ -723,7 +749,10 @@ def main():
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
"--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.",
)
parser.add_argument(
"--max_steps",
@@ -742,10 +771,14 @@ def main():
)
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument(
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
"--overwrite_output_dir",
action="store_true",
help="Overwrite the content of the output directory",
)
parser.add_argument(
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
"--overwrite_cache",
action="store_true",
help="Overwrite the cached training and evaluation sets",
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")