From 68ab9599ce3aefbd25d1c81e3315d1968849b628 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 19 Jun 2019 09:38:38 +0200
Subject: [PATCH] small fix and updates to readme

---
 README.md                           | 24 ++++++++++++++++++----
 examples/bertology.py               | 31 +++++++++++++++++++----------
 examples/run_classifier.py          |  6 +++++-
 examples/run_squad.py               |  4 ++++
 pytorch_pretrained_bert/modeling.py |  6 ++++--
 5 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index b0a155f140..a48f8e3cf5 100644
--- a/README.md
+++ b/README.md
@@ -1322,12 +1322,14 @@ python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/pre
 {"f1": 88.52381567990474, "exact_match": 81.22043519394512}
 ```
 
-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 93 on SQuAD:
+**distributed training**
+
+Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach an F1 > 93 on SQuAD:
 
 ```bash
 python -m torch.distributed.launch --nproc_per_node=8 \
  run_squad.py \
- --bert_model bert-large-cased-whole-word-masking  \
+ --bert_model bert-large-uncased-whole-word-masking  \
  --do_train \
  --do_predict \
  --do_lower_case \
@@ -1337,17 +1339,31 @@ python -m torch.distributed.launch --nproc_per_node=8 \
  --num_train_epochs 2 \
  --max_seq_length 384 \
  --doc_stride 128 \
- --output_dir ../models/train_squad_large_cased_wwm/ \
+ --output_dir ../models/wwm_uncased_finetuned_squad/ \
  --train_batch_size 24 \
  --gradient_accumulation_steps 12
 ```
 
 Training with these hyper-parameters gave us the following results:
 ```bash
-python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/train_squad_large_cased_wwm/predictions.json
+python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
 {"exact_match": 86.91579943235573, "f1": 93.1532499015869}
 ```
 
+This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-squad`.
+
+And here is the model provided as `bert-large-cased-whole-word-masking-finetuned-squad`:
+
+```bash
+python -m torch.distributed.launch --nproc_per_node=8  run_squad.py  --bert_model bert-large-cased-whole-word-masking   --do_train  --do_predict  --do_lower_case  --train_file $SQUAD_DIR/train-v1.1.json  --predict_file $SQUAD_DIR/dev-v1.1.json  --learning_rate 3e-5  --num_train_epochs 2  --max_seq_length 384  --doc_stride 128  --output_dir ../models/wwm_cased_finetuned_squad/  --train_batch_size 24  --gradient_accumulation_steps 12
+```
+
+Training with these hyper-parameters gave us the following results:
+```bash
+python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_cased_finetuned_squad/predictions.json
+{"exact_match": 84.18164616840113, "f1": 91.58645594850135}
+```
+
 #### SWAG
 
 The data for SWAG can be downloaded by cloning the following [repository](https://github.com/rowanz/swagaf)
diff --git a/examples/bertology.py b/examples/bertology.py
index 7db2f9e51e..b7e73e30d4 100644
--- a/examples/bertology.py
+++ b/examples/bertology.py
@@ -8,7 +8,7 @@ import torch
 import torch.nn.functional as F
 import numpy as np
 
-from pytorch_pretrained_bert import BertModel, BertTokenizer
+from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
@@ -17,24 +17,33 @@ logger = logging.getLogger(__name__)
 
 def run_model():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name_or_path', type=str, default='bert-base-uncased',
-                                                help='pretrained model name or path to local checkpoint')
+    parser.add_argument('--model_name_or_path', type=str, default='bert-base-uncased', help='pretrained model name or path to local checkpoint')
     parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
     args = parser.parse_args()
-    print(args)
-
-    if args.batch_size == -1:
-        args.batch_size = 1
-    assert args.nsamples % args.batch_size == 0
 
     np.random.seed(args.seed)
     torch.random.manual_seed(args.seed)
     torch.cuda.manual_seed(args.seed)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    if args.local_rank == -1 or args.no_cuda:
+        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        args.device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+
+    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        args.device, n_gpu, bool(args.local_rank != -1), args.fp16))
 
     tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
-    model = BertModel.from_pretrained(args.model_name_or_path)
-    model.to(device)
+    model = BertForSequenceClassification.from_pretrained(args.model_name_or_path)
+    model.to(args.device)
     model.eval()
 
     
diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index e708671e42..eda96f81e3 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -187,7 +187,7 @@ def main():
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir):
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(args.output_dir)
 
     task_name = args.task_name.lower()
@@ -361,6 +361,10 @@ def main():
         # Load a trained model and vocabulary that you have fine-tuned
         model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
         tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+
+        # Good practice: save your training arguments together with the trained model
+        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
+        torch.save(args, output_args_file)
     else:
         model = BertForSequenceClassification.from_pretrained(args.bert_model)
 
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 0d0f52e760..bf1763e884 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -331,6 +331,10 @@ def main():
         # Load a trained model and vocabulary that you have fine-tuned
         model = BertForQuestionAnswering.from_pretrained(args.output_dir)
         tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+
+        # Good practice: save your training arguments together with the trained model
+        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
+        torch.save(args, output_args_file)
     else:
         model = BertForQuestionAnswering.from_pretrained(args.bert_model)
 
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 4dfffb8e43..d7493f07ca 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -46,8 +46,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
-    'bert-base-uncased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-finetuned-mrpc-pytorch_model.bin",
-    'bert-large-uncased-whole-word-masking-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-mrpc-pytorch_model.bin",
+    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
 }
 PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
@@ -60,6 +59,9 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
     'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
     'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
+    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
+    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
+    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
 }
 BERT_CONFIG_NAME = 'bert_config.json'
 TF_WEIGHTS_NAME = 'model.ckpt'