From 7b75aa9fa55bee577e2c7403301ed31103125a35 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Fri, 8 May 2020 14:10:05 -0400
Subject: [PATCH] [TPU] Doc, fix xla_spawn.py, only preprocess dataset once
 (#4223)

* [TPU] Doc, fix xla_spawn.py, only preprocess dataset once

* Update examples/README.md

* [xla_spawn] Add `_mp_fn` to other Trainer scripts

* [TPU] Fix: eval dataloader was None
---
 examples/README.md                            | 26 ++++++++++++++++-
 examples/bertology/run_bertology.py           |  2 +-
 .../run_language_modeling.py                  |  5 ++++
 .../multiple-choice/run_multiple_choice.py    |  5 ++++
 examples/text-classification/README.md        | 12 ++++----
 examples/text-classification/run_glue.py      | 16 ++---------
 examples/token-classification/run_ner.py      |  5 ++++
 examples/xla_spawn.py                         | 12 ++++----
 src/transformers/data/datasets/glue.py        | 24 ++++++++--------
 src/transformers/trainer.py                   | 28 ++++++++++++++-----
 10 files changed, 88 insertions(+), 47 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index b42e0cd534..0645c1e1b6 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -53,4 +53,28 @@ pip install -r ./examples/requirements.txt
 
 ## Running on TPUs
 
-Documentation to come.
+When using Tensorflow, TPUs are supported out of the box as a `tf.distribute.Strategy`.
+
+When using PyTorch, we support TPUs thanks to `pytorch/xla`. For more context and information on how to setup your TPU environment refer to Google's documentation and to the
+very detailed [pytorch/xla README](https://github.com/pytorch/xla/blob/master/README.md).
+
+In this repo, we provide a very simple launcher script named [xla_spawn.py](./xla_spawn.py) that lets you run our example scripts on multiple TPU cores without any boilerplate.
+Just pass a `--num_cores` flag to this script, then your regular training script with its arguments (this is similar to the `torch.distributed.launch` helper for torch.distributed).
+
+For example for `run_glue`:
+
+```bash
+python examples/xla_spawn.py --num_cores 8 \
+	examples/text-classification/run_glue.py
+	--model_name_or_path bert-base-cased \
+	--task_name mnli \
+	--data_dir ./data/glue_data/MNLI \
+	--output_dir ./models/tpu \
+	--overwrite_output_dir \
+	--do_train \
+	--do_eval \
+	--num_train_epochs 1 \
+	--save_steps 20000
+```
+
+Feedback and more use cases and benchmarks involving TPUs are welcome, please share with the community.
diff --git a/examples/bertology/run_bertology.py b/examples/bertology/run_bertology.py
index 2904358f90..8d26bf890c 100644
--- a/examples/bertology/run_bertology.py
+++ b/examples/bertology/run_bertology.py
@@ -404,7 +404,7 @@ def main():
     logger.info("Training/evaluation parameters %s", args)
 
     # Prepare dataset for the GLUE task
-    eval_dataset = GlueDataset(args, tokenizer=tokenizer, evaluate=True, local_rank=args.local_rank)
+    eval_dataset = GlueDataset(args, tokenizer=tokenizer, evaluate=True)
     if args.data_subset > 0:
         eval_dataset = Subset(eval_dataset, list(range(min(args.data_subset, len(eval_dataset)))))
     eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
diff --git a/examples/language-modeling/run_language_modeling.py b/examples/language-modeling/run_language_modeling.py
index 660a6520cb..1034f2dc9e 100644
--- a/examples/language-modeling/run_language_modeling.py
+++ b/examples/language-modeling/run_language_modeling.py
@@ -280,5 +280,10 @@ def main():
     return results
 
 
+def _mp_fn(index):
+    # For xla_spawn (TPUs)
+    main()
+
+
 if __name__ == "__main__":
     main()
diff --git a/examples/multiple-choice/run_multiple_choice.py b/examples/multiple-choice/run_multiple_choice.py
index 7a4966f00e..6e41b3d9c9 100644
--- a/examples/multiple-choice/run_multiple_choice.py
+++ b/examples/multiple-choice/run_multiple_choice.py
@@ -221,5 +221,10 @@ def main():
     return results
 
 
+def _mp_fn(index):
+    # For xla_spawn (TPUs)
+    main()
+
+
 if __name__ == "__main__":
     main()
diff --git a/examples/text-classification/README.md b/examples/text-classification/README.md
index 3e2679b344..7fbf744381 100644
--- a/examples/text-classification/README.md
+++ b/examples/text-classification/README.md
@@ -85,10 +85,12 @@ CoLA, SST-2. The following section provides details on how to run half-precision
 said, there shouldn’t be any issues in running half-precision training with the remaining GLUE tasks as well,
 since the data processor for each task inherits from the base class DataProcessor.
 
-## Running on TPUs
+## Running on TPUs in PyTorch
 
-You can accelerate your workloads on Google's TPUs. For information on how to setup your TPU environment refer to this
-[README](https://github.com/pytorch/xla/blob/master/README.md).
+**Update**: read the more up-to-date [Running on TPUs](../README.md#running-on-tpus) in the main README.md instead.
+
+Even when running PyTorch, you can accelerate your workloads on Google's TPUs, using `pytorch/xla`. For information on how to setup your TPU environment refer to the
+[pytorch/xla README](https://github.com/pytorch/xla/blob/master/README.md).
 
 The following are some examples of running the `*_tpu.py` finetuning scripts on TPUs. All steps for data preparation are
 identical to your normal GPU + Huggingface setup.
@@ -101,7 +103,6 @@ export GLUE_DIR=/path/to/glue
 export TASK_NAME=MNLI
 
 python run_glue_tpu.py \
-  --model_type bert \
   --model_name_or_path bert-base-cased \
   --task_name $TASK_NAME \
   --do_train \
@@ -115,8 +116,7 @@ python run_glue_tpu.py \
   --overwrite_output_dir \
   --logging_steps 50 \
   --save_steps 200 \
-  --num_cores=8 \
-  --only_log_master
+  --num_cores=8
 ```
 
 ### MRPC
diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py
index fd568af107..9bfe6aa288 100644
--- a/examples/text-classification/run_glue.py
+++ b/examples/text-classification/run_glue.py
@@ -134,16 +134,8 @@ def main():
     )
 
     # Get datasets
-    train_dataset = (
-        GlueDataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank)
-        if training_args.do_train
-        else None
-    )
-    eval_dataset = (
-        GlueDataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank, evaluate=True)
-        if training_args.do_eval
-        else None
-    )
+    train_dataset = GlueDataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
+    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
 
     def compute_metrics(p: EvalPrediction) -> Dict:
         if output_mode == "classification":
@@ -181,9 +173,7 @@ def main():
         eval_datasets = [eval_dataset]
         if data_args.task_name == "mnli":
             mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
-            eval_datasets.append(
-                GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, local_rank=training_args.local_rank, evaluate=True)
-            )
+            eval_datasets.append(GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, evaluate=True))
 
         for eval_dataset in eval_datasets:
             result = trainer.evaluate(eval_dataset=eval_dataset)
diff --git a/examples/token-classification/run_ner.py b/examples/token-classification/run_ner.py
index 9b664cf3cd..e70512b979 100644
--- a/examples/token-classification/run_ner.py
+++ b/examples/token-classification/run_ner.py
@@ -292,5 +292,10 @@ def main():
     return results
 
 
+def _mp_fn(index):
+    # For xla_spawn (TPUs)
+    main()
+
+
 if __name__ == "__main__":
     main()
diff --git a/examples/xla_spawn.py b/examples/xla_spawn.py
index 460e5d83a0..0889e57afc 100644
--- a/examples/xla_spawn.py
+++ b/examples/xla_spawn.py
@@ -12,17 +12,13 @@ Inspired by https://github.com/pytorch/pytorch/blob/master/torch/distributed/lau
 
 
 import importlib
-import os
 import sys
 from argparse import REMAINDER, ArgumentParser
+from pathlib import Path
 
 import torch_xla.distributed.xla_multiprocessing as xmp
 
 
-def trim_suffix(s: str, suffix: str):
-    return s if not s.endswith(suffix) or len(suffix) == 0 else s[: -len(suffix)]
-
-
 def parse_args():
     """
     Helper function parsing the command line options
@@ -44,7 +40,7 @@ def parse_args():
         "training_script",
         type=str,
         help=(
-            "The full module name to the single TPU training "
+            "The full path to the single TPU training "
             "program/script to be launched in parallel, "
             "followed by all the arguments for the "
             "training script"
@@ -61,7 +57,9 @@ def main():
     args = parse_args()
 
     # Import training_script as a module.
-    mod_name = trim_suffix(os.path.basename(args.training_script), ".py")
+    script_fpath = Path(args.training_script)
+    sys.path.append(str(script_fpath.parent.resolve()))
+    mod_name = script_fpath.stem
     mod = importlib.import_module(mod_name)
 
     # Patch sys.argv
diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py
index 63d9b69af8..3c963c2da3 100644
--- a/src/transformers/data/datasets/glue.py
+++ b/src/transformers/data/datasets/glue.py
@@ -5,12 +5,12 @@ from dataclasses import dataclass, field
 from typing import List, Optional
 
 import torch
+from filelock import FileLock
 from torch.utils.data.dataset import Dataset
 
 from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
 from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_xlm_roberta import XLMRobertaTokenizer
-from ...trainer import torch_distributed_zero_first
 from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
 from ..processors.utils import InputFeatures
 
@@ -63,7 +63,6 @@ class GlueDataset(Dataset):
         tokenizer: PreTrainedTokenizer,
         limit_length: Optional[int] = None,
         evaluate=False,
-        local_rank=-1,
     ):
         self.args = args
         processor = glue_processors[args.task_name]()
@@ -75,9 +74,11 @@ class GlueDataset(Dataset):
                 "dev" if evaluate else "train", tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
             ),
         )
-        with torch_distributed_zero_first(local_rank):
-            # Make sure only the first process in distributed training processes the dataset,
-            # and the others will use the cache.
+
+        # Make sure only the first process in distributed training processes the dataset,
+        # and the others will use the cache.
+        lock_path = cached_features_file + ".lock"
+        with FileLock(lock_path):
 
             if os.path.exists(cached_features_file) and not args.overwrite_cache:
                 start = time.time()
@@ -109,13 +110,12 @@ class GlueDataset(Dataset):
                     label_list=label_list,
                     output_mode=self.output_mode,
                 )
-                if local_rank in [-1, 0]:
-                    start = time.time()
-                    torch.save(self.features, cached_features_file)
-                    # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
-                    logger.info(
-                        f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
-                    )
+                start = time.time()
+                torch.save(self.features, cached_features_file)
+                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
+                logger.info(
+                    f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
+                )
 
     def __len__(self):
         return len(self.features)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 6d3e4f97b5..d07a7a8adf 100644
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -6,7 +6,7 @@ import re
 import shutil
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -195,10 +195,12 @@ class Trainer:
         if eval_dataset is None and self.eval_dataset is None:
             raise ValueError("Trainer: evaluation requires an eval_dataset.")
 
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+
         sampler = get_tpu_sampler(eval_dataset) if is_tpu_available() else None
 
         data_loader = DataLoader(
-            eval_dataset if eval_dataset is not None else self.eval_dataset,
+            eval_dataset,
             sampler=sampler,
             batch_size=self.args.eval_batch_size,
             shuffle=False,
@@ -267,6 +269,16 @@ class Trainer:
         # keep track of model topology and gradients
         wandb.watch(self.model)
 
+    def num_examples(self, dataloader: Union[DataLoader, "pl.PerDeviceLoader"]) -> int:
+        """
+        Helper to get num of examples from a DataLoader, by accessing its Dataset.
+        """
+        if is_tpu_available():
+            assert isinstance(dataloader, pl.PerDeviceLoader)
+            return len(dataloader._loader._loader.dataset)
+        else:
+            return len(dataloader.dataset)
+
     def train(self, model_path: Optional[str] = None):
         """
         Main training entry point.
@@ -326,17 +338,15 @@ class Trainer:
 
         # Train!
         if is_tpu_available():
-            num_examples = len(train_dataloader._loader._loader.dataset)
             total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
         else:
-            num_examples = len(train_dataloader.dataset)
             total_train_batch_size = (
                 self.args.train_batch_size
                 * self.args.gradient_accumulation_steps
                 * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)
             )
         logger.info("***** Running training *****")
-        logger.info("  Num examples = %d", num_examples)
+        logger.info("  Num examples = %d", self.num_examples(train_dataloader))
         logger.info("  Num Epochs = %d", num_train_epochs)
         logger.info("  Instantaneous batch size per device = %d", self.args.per_gpu_train_batch_size)
         logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size)
@@ -606,9 +616,13 @@ class Trainer:
             model = self.model
         model.to(self.args.device)
 
+        if is_tpu_available():
+            batch_size = dataloader._loader._loader.batch_size
+        else:
+            batch_size = dataloader.batch_size
         logger.info("***** Running %s *****", description)
-        logger.info("  Num examples = %d", len(dataloader.dataset))
-        logger.info("  Batch size = %d", dataloader.batch_size)
+        logger.info("  Num examples = %d", self.num_examples(dataloader))
+        logger.info("  Batch size = %d", batch_size)
         eval_losses: List[float] = []
         preds: np.ndarray = None
         label_ids: np.ndarray = None