From 70c10caa06d9feda3f446d0a82655f56cd2afdab Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 5 Aug 2019 17:09:37 +0200 Subject: [PATCH] add option mentioned in #940 --- examples/run_glue.py | 6 ++++++ examples/run_squad.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/examples/run_glue.py b/examples/run_glue.py index 0d4ffaa390..a939ea373b 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -247,6 +247,9 @@ def evaluate(args, model, tokenizer, prefix=""): def load_and_cache_examples(args, task, tokenizer, evaluate=False): + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache + processor = processors[task]() output_mode = output_modes[task] # Load data features from cache or dataset file @@ -273,6 +276,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) + if args.local_rank == 0: + torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache + # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) diff --git a/examples/run_squad.py b/examples/run_squad.py index 7d768d2c43..e62a1f1ff3 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -272,6 +272,9 @@ def evaluate(args, model, tokenizer, prefix=""): def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False): + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache + # Load data features from cache or dataset file input_file = args.predict_file if evaluate else args.train_file cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format( @@ -296,6 +299,9 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) + if args.local_rank == 0: + torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache + # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)