Merge branch 'master' into auto_models

2019-08-05 19:17:35 +02:00
parent 0b524b0848 3a126e73dd
commit d43dc48b34
16 changed files with 340 additions and 108 deletions
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -92,6 +92,10 @@ def train(args, train_dataset, model, tokenizer):
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
@@ -243,6 +247,9 @@ def evaluate(args, model, tokenizer, prefix=""):


 def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
+
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
@@ -269,6 +276,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
+
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
@@ -418,8 +428,6 @@ def main():
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)

    logger.info("Training/evaluation parameters %s", args)

--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -101,6 +101,10 @@ def train(args, train_dataset, model, tokenizer):
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
@@ -241,7 +245,10 @@ def evaluate(args, model, tokenizer, prefix=""):
    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
-    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+    if args.version_2_with_negative:
+        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+    else:
+        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
@@ -265,6 +272,9 @@ def evaluate(args, model, tokenizer, prefix=""):


 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
+
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
@@ -289,6 +299,9 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
+
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
@@ -457,8 +470,6 @@ def main():
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)

    logger.info("Training/evaluation parameters %s", args)