From 0ce2f496dc4578e0dc2be8ff02783600d23dc635 Mon Sep 17 00:00:00 2001
From: Yaser Martinez Palenzuela <yaser.martinez@gmail.com>
Date: Mon, 5 Nov 2018 22:34:12 +0100
Subject: [PATCH 01/28] Port tokenization for the multilingual model

---
 tokenization.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/tokenization.py b/tokenization.py
index 83bc86e444..8cf83720d9 100644
--- a/tokenization.py
+++ b/tokenization.py
@@ -133,6 +133,13 @@ class BasicTokenizer(object):
         """Tokenizes a piece of text."""
         text = convert_to_unicode(text)
         text = self._clean_text(text)
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        text = self._tokenize_chinese_chars(text)
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:
@@ -174,7 +181,42 @@ class BasicTokenizer(object):
             i += 1
 
         return ["".join(x) for x in output]
+    
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
 
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like all of the other languages.
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+            (cp >= 0x3400 and cp <= 0x4DBF) or  #
+            (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+            (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+            (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+            (cp >= 0x2B820 and cp <= 0x2CEAF) or
+            (cp >= 0xF900 and cp <= 0xFAFF) or  #
+            (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+            return True
+    
+        return False
+    
     def _clean_text(self, text):
         """Performs invalid character removal and whitespace cleanup on text."""
         output = []

From 4d124baf8f4706c6060d446b38f07c4258a91d97 Mon Sep 17 00:00:00 2001
From: Yaser Martinez Palenzuela <yaser.martinez@gmail.com>
Date: Mon, 5 Nov 2018 23:04:29 +0100
Subject: [PATCH 02/28] Add test for Chinese tokenization

---
 tests/tokenization_test.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py
index 8d6ede9300..7c12ecccfe 100644
--- a/tests/tokenization_test.py
+++ b/tests/tokenization_test.py
@@ -43,6 +43,13 @@ class TokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
+    def test_chinese(self):
+        tokenizer = tokenization.BasicTokenizer()
+    
+        self.assertListEqual(
+            tokenizer.tokenize(u"ah\u535A\u63A8zz"),
+            [u"ah", u"\u535A", u"\u63A8", u"zz"])  
+
     def test_basic_tokenizer_lower(self):
         tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
 

From 2a97fe220b5d7c8bf7809aaea1ae075a0c0ae53f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 6 Nov 2018 17:26:33 +0100
Subject: [PATCH 03/28] fixing weights initialization in the model and out of
 span clamping

---
 modeling.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/modeling.py b/modeling.py
index c467e8266e..0b8ac586e9 100644
--- a/modeling.py
+++ b/modeling.py
@@ -388,10 +388,10 @@ class BertForSequenceClassification(nn.Module):
             if isinstance(module, (nn.Linear, nn.Embedding)):
                 # Slightly different from the TF version which uses truncated_normal for initialization
                 # cf https://github.com/pytorch/pytorch/pull/5617
-                module.weight.data.normal_(config.initializer_range)
+                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
             elif isinstance(module, BERTLayerNorm):
-                module.beta.data.normal_(config.initializer_range)
-                module.gamma.data.normal_(config.initializer_range)
+                module.beta.data.normal_(mean=0.0, std=config.initializer_range)
+                module.gamma.data.normal_(mean=0.0, std=config.initializer_range)
             if isinstance(module, nn.Linear):
                 module.bias.data.zero_()
         self.apply(init_weights)
@@ -438,10 +438,10 @@ class BertForQuestionAnswering(nn.Module):
             if isinstance(module, (nn.Linear, nn.Embedding)):
                 # Slightly different from the TF version which uses truncated_normal for initialization
                 # cf https://github.com/pytorch/pytorch/pull/5617
-                module.weight.data.normal_(config.initializer_range)
+                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
             elif isinstance(module, BERTLayerNorm):
-                module.beta.data.normal_(config.initializer_range)
-                module.gamma.data.normal_(config.initializer_range)
+                module.beta.data.normal_(mean=0.0, std=config.initializer_range)
+                module.gamma.data.normal_(mean=0.0, std=config.initializer_range)
             if isinstance(module, nn.Linear):
                 module.bias.data.zero_()
         self.apply(init_weights)
@@ -459,7 +459,7 @@ class BertForQuestionAnswering(nn.Module):
             start_positions = start_positions.squeeze(-1)
             end_positions = end_positions.squeeze(-1)
             # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1) + 1
+            ignored_index = start_logits.size(1)
             start_positions.clamp_(0, ignored_index)
             end_positions.clamp_(0, ignored_index)
 

From a1126237a9980df8255054c472d2894d05ec7315 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 6 Nov 2018 17:31:15 +0100
Subject: [PATCH 04/28] clean up logits extraction logic

---
 run_squad.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/run_squad.py b/run_squad.py
index 8a69e057e5..50d450d85a 100644
--- a/run_squad.py
+++ b/run_squad.py
@@ -908,7 +908,7 @@ def main():
         model.eval()
         all_results = []
         logger.info("Start evaluating")
-        for input_ids, input_mask, segment_ids, example_index in tqdm(eval_dataloader, desc="Evaluating"):
+        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
             if len(all_results) % 1000 == 0:
                 logger.info("Processing example: %d" % (len(all_results)))
 
@@ -916,21 +916,18 @@ def main():
             input_mask = input_mask.to(device)
             segment_ids = segment_ids.to(device)
 
-            start_logits, end_logits = model(input_ids, segment_ids, input_mask)
+            with torch.no_grad():
+                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
 
-            unique_id = [int(eval_features[e.item()].unique_id) for e in example_index]
-            start_logits = [x.view(-1).detach().cpu().numpy() for x in start_logits]
-            end_logits = [x.view(-1).detach().cpu().numpy() for x in end_logits]
-            for idx, i in enumerate(unique_id):
-                s = [float(x) for x in start_logits[idx]]
-                e = [float(x) for x in end_logits[idx]]
-                all_results.append(
-                    RawResult(
-                        unique_id=i,
-                        start_logits=s,
-                        end_logits=e
-                    )
-                )
+            for i, example_index in enumerate(example_indices):
+                start_logits = batch_start_logits[i].detach().cpu().tolist()
+                end_logits = batch_end_logits[i].detach().cpu().tolist()
+
+                eval_feature = eval_features[example_index.item()]
+                unique_id = int(eval_feature.unique_id)
+                all_results.append(RawResult(unique_id=unique_id,
+                                             start_logits=start_logits,
+                                             end_logits=end_logits))
 
         output_prediction_file = os.path.join(args.output_dir, "predictions.json")
         output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")

From 6bb7510a50af4b736df296620fa58a77fea978e2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 7 Nov 2018 22:12:41 +0100
Subject: [PATCH 05/28] fixing pre-processing bug - averaging loss for gradient
 accumulation - no_grad on evaluation

---
 run_classifier.py | 21 +++++++++------------
 run_squad.py      | 45 +++++++++++++++------------------------------
 2 files changed, 24 insertions(+), 42 deletions(-)

diff --git a/run_classifier.py b/run_classifier.py
index b5290afd12..c19c6f9ac0 100644
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -458,7 +458,6 @@ def main():
         raise ValueError("Task not found: %s" % (task_name))
 
     processor = processors[task_name]()
-
     label_list = processor.get_labels()
 
     tokenizer = tokenization.FullTokenizer(
@@ -518,20 +517,18 @@ def main():
         for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
             tr_loss = 0
             nb_tr_examples, nb_tr_steps = 0, 0
-            for step, (input_ids, input_mask, segment_ids, label_ids) in enumerate(tqdm(train_dataloader, desc="Iteration")):
-                input_ids = input_ids.to(device)
-                input_mask = input_mask.to(device)
-                segment_ids = segment_ids.to(device)
-                label_ids = label_ids.to(device)
-
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, input_mask, segment_ids, label_ids = batch
                 loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+                loss.backward()
                 tr_loss += loss.item()
                 nb_tr_examples += input_ids.size(0)
                 nb_tr_steps += 1
-                loss.backward()
-
                 if (step + 1) % args.gradient_accumulation_steps == 0:
                     optimizer.step()    # We have accumulated enought gradients
                     model.zero_grad()
@@ -579,13 +576,13 @@ def main():
             nb_eval_examples += input_ids.size(0)
             nb_eval_steps += 1
 
-        eval_loss = eval_loss / nb_eval_steps #len(eval_dataloader)
-        eval_accuracy = eval_accuracy / nb_eval_examples #len(eval_dataloader)
+        eval_loss = eval_loss / nb_eval_steps
+        eval_accuracy = eval_accuracy / nb_eval_examples
 
         result = {'eval_loss': eval_loss,
                   'eval_accuracy': eval_accuracy,
                   'global_step': global_step,
-                  'loss': tr_loss/nb_tr_steps}#'loss': loss.item()}
+                  'loss': tr_loss/nb_tr_steps}
 
         output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
         with open(output_eval_file, "w") as writer:
diff --git a/run_squad.py b/run_squad.py
index 8a69e057e5..a25893e1d9 100644
--- a/run_squad.py
+++ b/run_squad.py
@@ -743,7 +743,7 @@ def main():
                         type=int,
                         default=1,
                         help="Number of updates steps to accumualte before performing a backward/update pass.")
-    
+
     args = parser.parse_args()
 
     if args.local_rank == -1 or args.no_cuda:
@@ -857,20 +857,13 @@ def main():
         model.train()
         for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
             for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+                batch = tuple(t.to(device) for t in batch)
                 input_ids, input_mask, segment_ids, start_positions, end_positions = batch
-                input_ids = input_ids.to(device)
-                input_mask = input_mask.to(device)
-                segment_ids = segment_ids.to(device)
-                start_positions = start_positions.to(device)
-                end_positions = start_positions.to(device)
-
-                start_positions = start_positions.view(-1, 1)
-                end_positions = end_positions.view(-1, 1)
-
                 loss, _ = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
-
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
                 loss.backward()
                 if (step + 1) % args.gradient_accumulation_steps == 0:
                     optimizer.step()    # We have accumulated enought gradients
@@ -908,30 +901,22 @@ def main():
         model.eval()
         all_results = []
         logger.info("Start evaluating")
-        for input_ids, input_mask, segment_ids, example_index in tqdm(eval_dataloader, desc="Evaluating"):
+        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
             if len(all_results) % 1000 == 0:
                 logger.info("Processing example: %d" % (len(all_results)))
-
             input_ids = input_ids.to(device)
             input_mask = input_mask.to(device)
             segment_ids = segment_ids.to(device)
-
-            start_logits, end_logits = model(input_ids, segment_ids, input_mask)
-
-            unique_id = [int(eval_features[e.item()].unique_id) for e in example_index]
-            start_logits = [x.view(-1).detach().cpu().numpy() for x in start_logits]
-            end_logits = [x.view(-1).detach().cpu().numpy() for x in end_logits]
-            for idx, i in enumerate(unique_id):
-                s = [float(x) for x in start_logits[idx]]
-                e = [float(x) for x in end_logits[idx]]
-                all_results.append(
-                    RawResult(
-                        unique_id=i,
-                        start_logits=s,
-                        end_logits=e
-                    )
-                )
-
+            with torch.no_grad():
+                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
+            for i, example_index in enumerate(example_indices):
+                start_logits = batch_start_logits[i].detach().cpu().tolist()
+                end_logits = batch_end_logits[i].detach().cpu().tolist()
+                eval_feature = eval_features[example_index.item()]
+                unique_id = int(eval_feature.unique_id)
+                all_results.append(RawResult(unique_id=unique_id,
+                                             start_logits=start_logits,
+                                             end_logits=end_logits))
         output_prediction_file = os.path.join(args.output_dir, "predictions.json")
         output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
         write_predictions(eval_examples, eval_features, all_results,

From dbc318a4c605374f6663098ffa8701a626f2b23a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 7 Nov 2018 22:22:55 +0100
Subject: [PATCH 06/28] cleaning up - speeding up a bit multi-gpu

---
 modeling.py       | 2 +-
 run_classifier.py | 7 ++++---
 run_squad.py      | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/modeling.py b/modeling.py
index c467e8266e..860cb939a4 100644
--- a/modeling.py
+++ b/modeling.py
@@ -467,6 +467,6 @@ class BertForQuestionAnswering(nn.Module):
             start_loss = loss_fct(start_logits, start_positions)
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-            return total_loss, (start_logits, end_logits)
+            return total_loss
         else:
             return start_logits, end_logits
diff --git a/run_classifier.py b/run_classifier.py
index c19c6f9ac0..41c7459bd3 100644
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -514,13 +514,13 @@ def main():
         train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
 
         model.train()
-        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
+        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
             tr_loss = 0
             nb_tr_examples, nb_tr_steps = 0, 0
             for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                 batch = tuple(t.to(device) for t in batch)
                 input_ids, input_mask, segment_ids, label_ids = batch
-                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
+                loss = model(input_ids, segment_ids, input_mask, label_ids)
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
                 if args.gradient_accumulation_steps > 1:
@@ -564,7 +564,8 @@ def main():
             segment_ids = segment_ids.to(device)
             label_ids = label_ids.to(device)
 
-            tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
+            with torch.no_grad():
+                tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
 
             logits = logits.detach().cpu().numpy()
             label_ids = label_ids.to('cpu').numpy()
diff --git a/run_squad.py b/run_squad.py
index a25893e1d9..78dff7dea5 100644
--- a/run_squad.py
+++ b/run_squad.py
@@ -855,11 +855,11 @@ def main():
         train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
 
         model.train()
-        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
+        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
             for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                 batch = tuple(t.to(device) for t in batch)
                 input_ids, input_mask, segment_ids, start_positions, end_positions = batch
-                loss, _ = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
+                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
                 if args.gradient_accumulation_steps > 1:

From d92a7f7721f56e668338dd0c4ec89a1b8e3108f1 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Wed, 7 Nov 2018 23:37:55 +0100
Subject: [PATCH 07/28] Removing note on run_squad.py example

---
 README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.md b/README.md
index efa3c2c283..0316696ec3 100644
--- a/README.md
+++ b/README.md
@@ -194,5 +194,3 @@ python run_squad.py \
   --doc_stride 128 \
   --output_dir ../debug_squad/
 ```
-
-There is currently a bug in the `run_squad.py` script that we are investigating. The reported numbers are very low (F1 of 41.8 and exact match of 21.7) even though the correct answer is usually in the n-best predictions. We are investigating that right now on the develop branch, follow [this issue](https://github.com/huggingface/pytorch-pretrained-BERT/issues/3) for more updates.

From 48d4a5317cf395a522a91186e57745585fc0d817 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 7 Nov 2018 23:51:12 +0100
Subject: [PATCH 08/28] typo fix in output tuple

---
 run_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run_classifier.py b/run_classifier.py
index 41c7459bd3..54b7d5a26c 100644
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -520,7 +520,7 @@ def main():
             for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                 batch = tuple(t.to(device) for t in batch)
                 input_ids, input_mask, segment_ids, label_ids = batch
-                loss = model(input_ids, segment_ids, input_mask, label_ids)
+                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
                 if args.gradient_accumulation_steps > 1:

From 0ed7696191f4c8cdf2d1fce8771fd4472d36bce8 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Thu, 8 Nov 2018 00:39:42 +0100
Subject: [PATCH 09/28] Updated MRPC results

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0316696ec3..5375ebb27a 100644
--- a/README.md
+++ b/README.md
@@ -166,7 +166,7 @@ python run_classifier.py \
   --output_dir /tmp/mrpc_output/
 ```
 
-Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation results between 82 and 87.
+Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation results between 84% and 88%.
 
 The second example fine-tunes `BERT-Base` on the SQuAD question answering task.
 

From 3bfbc21376af691b912f3b6256bbeaf8e0046ba8 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Thu, 8 Nov 2018 00:44:17 +0100
Subject: [PATCH 10/28] updating pytest command

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5375ebb27a..109600130a 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ A series of tests is included in the [tests folder](https://github.com/huggingfa
 
 You can run the tests with the command:
 ```bash
-pytest -sv ./tests/
+python -m pytest -sv tests/
 ```
 
 ## Training on large batches: gradient accumulation, multi-GPU and distributed training

From 4850ec5888bb3b207e37b2814657c2ad180de391 Mon Sep 17 00:00:00 2001
From: Gopal Krishna <mydigitalanjel@gmail.com>
Date: Fri, 9 Nov 2018 01:30:02 +0530
Subject: [PATCH 11/28] fixed small typos in the README.md (#8)

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 109600130a..41145da8eb 100644
--- a/README.md
+++ b/README.md
@@ -106,13 +106,13 @@ Here is how to use these techniques in our scripts:
 
 - **Gradient Accumulation**: Gradient accumulation can be used by supplying a integer greater than 1 to the `--gradient_accumulation_steps` argument. The batch at each step will be divided by this integer and gradient will be accumulated over `gradient_accumulation_steps` steps.
 - **Multi-GPU**: Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs.
-- **Distributed training**: Distributed training can be activated by suppying an integer greater or equal to 0 to the `--local_rank` argument. To use Distributed training, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see the above blog post for more details):
+- **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument. To use Distributed training, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see the above blog post for more details):
 
 ```bash
 python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
 ```
 
-Where `$THIS_MACHINE_INDEX` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP adress `192.168.1.1` and an open port `1234`.
+Where `$THIS_MACHINE_INDEX` is a sequential index assigned to each of your machines (0, 1, 2...) and the machine with rank 0 has an IP address `192.168.1.1` and an open port `1234`.
 
 ## TPU support and pretraining scripts
 
@@ -128,9 +128,9 @@ Since, pre-training BERT is a particularly expensive operation that basically re
 
 We also include [two Jupyter Notebooks](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
 
-- The first NoteBook ([Comparing TF and PT models.ipynb](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing%20TF%20and%20PT%20models.ipynb)) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the sandard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
+- The first NoteBook ([Comparing TF and PT models.ipynb](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing%20TF%20and%20PT%20models.ipynb)) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
 
-- The second NoteBook ([Comparing TF and PT models SQuAD predictions.ipynb](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing%20TF%20and%20PT%20models%20SQuAD%20predictions.ipynb)) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the `BertForQuestionAnswering` and computes the sandard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
+- The second NoteBook ([Comparing TF and PT models SQuAD predictions.ipynb](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing%20TF%20and%20PT%20models%20SQuAD%20predictions.ipynb)) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the `BertForQuestionAnswering` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
 
 Please follow the instructions given in the notebooks to run and modify them. They can also be nice example on how to use the models in a simpler way than the full fine-tuning scripts we provide.
 
@@ -138,7 +138,7 @@ Please follow the instructions given in the notebooks to run and modify them. Th
 
 We showcase the same examples as [the original implementation](https://github.com/google-research/bert/): fine-tuning a sequence-level classifier on the MRPC classification corpus and a token-level classifier on the question answering dataset SQuAD.
 
-Before running theses examples you should download the
+Before running these examples you should download the
 [GLUE data](https://gluebenchmark.com/tasks) by running
 [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
 and unpack it to some directory `$GLUE_DIR`. Please also download the `BERT-Base`

From 2c5d993ba48841575d9c58f0754bca00b288431c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 8 Nov 2018 21:22:22 +0100
Subject: [PATCH 12/28] update readme - fix SQuAD model on multi-GPU

---
 README.md   | 5 +++++
 modeling.py | 8 +++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 41145da8eb..78e88ab5ea 100644
--- a/README.md
+++ b/README.md
@@ -194,3 +194,8 @@ python run_squad.py \
   --doc_stride 128 \
   --output_dir ../debug_squad/
 ```
+
+Training with the previous hyper-parameters and a batch size 32 (on 4 GPUs) for 2 epochs gave us the following results:
+```bash
+{"f1": 88.19829549714827, "exact_match": 80.75685903500474}
+```
diff --git a/modeling.py b/modeling.py
index 433ee2054c..43db3b30fb 100644
--- a/modeling.py
+++ b/modeling.py
@@ -455,9 +455,11 @@ class BertForQuestionAnswering(nn.Module):
         end_logits = end_logits.squeeze(-1)
 
         if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension - if not this is a no-op
-            start_positions = start_positions.squeeze(-1)
-            end_positions = end_positions.squeeze(-1)
+            # If we are on multi-GPU, the split adds a dimension: squeeze it
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
             # sometimes the start/end positions are outside our model inputs, we ignore these terms
             ignored_index = start_logits.size(1)
             start_positions.clamp_(0, ignored_index)

From 0c24db9d5f170b7cd735f6f9ca66b2f433228902 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Fri, 9 Nov 2018 09:11:59 +0100
Subject: [PATCH 13/28] update results for SQuAD

---
 README.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 78e88ab5ea..45c467e5dd 100644
--- a/README.md
+++ b/README.md
@@ -184,18 +184,19 @@ python run_squad.py \
   --bert_config_file $BERT_BASE_DIR/bert_config.json \
   --init_checkpoint $BERT_PYTORCH_DIR/pytorch_model.bin \
   --do_train \
-  --train_file $SQUAD_DIR/train-v1.1.json \
   --do_predict \
+  --do_lower_case \
+  --train_file $SQUAD_DIR/train-v1.1.json \
   --predict_file $SQUAD_DIR/dev-v1.1.json \
   --train_batch_size 12 \
-  --learning_rate 5e-5 \
+  --learning_rate 3e-5 \
   --num_train_epochs 2.0 \
   --max_seq_length 384 \
   --doc_stride 128 \
   --output_dir ../debug_squad/
 ```
 
-Training with the previous hyper-parameters and a batch size 32 (on 4 GPUs) for 2 epochs gave us the following results:
+Training with the previous hyper-parameters gave us the following results:
 ```bash
-{"f1": 88.19829549714827, "exact_match": 80.75685903500474}
+{"f1": 88.52381567990474, "exact_match": 81.22043519394512}
 ```

From 34bdc8b54fc6cd9d3877df43e23356cfd79cdfe9 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 9 Nov 2018 09:19:45 +0100
Subject: [PATCH 14/28] remove duplicate accumulate gradient step arguments

---
 run_classifier.py | 12 ++++--------
 run_squad.py      | 12 ++++--------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/run_classifier.py b/run_classifier.py
index 54b7d5a26c..ab5251b1c0 100644
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -392,10 +392,6 @@ def main():
                         default=False,
                         action='store_true',
                         help="Whether not to use CUDA when available")
-    parser.add_argument("--accumulate_gradients",
-                        type=int,
-                        default=1,
-                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
     parser.add_argument("--local_rank",
                         type=int,
                         default=-1,
@@ -426,11 +422,11 @@ def main():
         torch.distributed.init_process_group(backend='nccl')
     logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
 
-    if args.accumulate_gradients < 1:
-        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
-                            args.accumulate_gradients))
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
 
-    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)
+    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
 
     random.seed(args.seed)
     np.random.seed(args.seed)
diff --git a/run_squad.py b/run_squad.py
index 78dff7dea5..59bd32c7c6 100644
--- a/run_squad.py
+++ b/run_squad.py
@@ -731,10 +731,6 @@ def main():
                         type=int,
                         default=-1,
                         help="local_rank for distributed training on gpus")
-    parser.add_argument("--accumulate_gradients",
-                        type=int,
-                        default=1,
-                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
     parser.add_argument('--seed', 
                         type=int, 
                         default=42,
@@ -756,11 +752,11 @@ def main():
         torch.distributed.init_process_group(backend='nccl')
     logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
 
-    if args.accumulate_gradients < 1:
-        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
-                            args.accumulate_gradients))
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
 
-    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)
+    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
 
     random.seed(args.seed)
     np.random.seed(args.seed)

From 34a1a01091e5411bd7f4379a0d9c33d2b03c7be1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 9 Nov 2018 09:31:20 +0100
Subject: [PATCH 15/28] update code comment

---
 modeling.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modeling.py b/modeling.py
index 43db3b30fb..9c6fa38e05 100644
--- a/modeling.py
+++ b/modeling.py
@@ -337,8 +337,8 @@ class BertModel(nn.Module):
             token_type_ids = torch.zeros_like(input_ids)
 
         # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, from_seq_length]
-        # So we can broadcast to [batch_size, num_heads, to_seq_length, from_seq_length]
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
         # this attention mask is more simple than the triangular masking of causal attention
         # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
         extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

From 9e95cd8cd610323327a981f8fd24a19677cccf22 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 9 Nov 2018 11:23:55 +0100
Subject: [PATCH 16/28] clean up optimizer from unused functions

---
 optimization.py | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/optimization.py b/optimization.py
index e04f01b1d2..bce0a9bf2a 100644
--- a/optimization.py
+++ b/optimization.py
@@ -90,27 +90,6 @@ class BERTAdam(Optimizer):
                 lr.append(lr_scheduled)
         return lr
 
-    def to(self, device):
-        """ Move the optimizer state to a specified device"""
-        for state in self.state.values():
-            state['exp_avg'].to(device)
-            state['exp_avg_sq'].to(device)
-
-    def initialize_step(self, initial_step):
-        """Initialize state with a defined step (but we don't have stored averaged).
-        Arguments:
-            initial_step (int): Initial step number.
-        """
-        for group in self.param_groups:
-            for p in group['params']:
-                state = self.state[p]
-                # State initialization
-                state['step'] = initial_step
-                # Exponential moving average of gradient values
-                state['exp_avg'] = torch.zeros_like(p.data)
-                # Exponential moving average of squared gradient values
-                state['exp_avg_sq'] = torch.zeros_like(p.data)
-
     def step(self, closure=None):
         """Performs a single optimization step.
 

From 5f04aa00edd7fe08d4beae28d831b5c556b0c406 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 9 Nov 2018 11:28:14 +0100
Subject: [PATCH 17/28] option to perform optimization and keep the optimizer
 averages on CPU

---
 run_squad.py | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/run_squad.py b/run_squad.py
index 59bd32c7c6..e44044f9a0 100644
--- a/run_squad.py
+++ b/run_squad.py
@@ -719,7 +719,6 @@ def main():
     parser.add_argument("--max_answer_length", default=30, type=int,
                         help="The maximum length of an answer that can be generated. This is needed because the start "
                              "and end predictions are not conditioned on one another.")
-
     parser.add_argument("--verbose_logging", default=False, action='store_true',
                         help="If true, all of the warnings related to data processing will be printed. "
                              "A number of warnings are expected for a normal SQuAD evaluation.")
@@ -727,10 +726,6 @@ def main():
                         default=False,
                         action='store_true',
                         help="Whether not to use CUDA when available")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
     parser.add_argument('--seed', 
                         type=int, 
                         default=42,
@@ -738,7 +733,16 @@ def main():
     parser.add_argument('--gradient_accumulation_steps',
                         type=int,
                         default=1,
-                        help="Number of updates steps to accumualte before performing a backward/update pass.")
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--local_rank",
+                        type=int,
+                        default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--optimize_on_cpu',
+                        default=False,
+                        action='store_true',
+                        help="Whether to perform optimization and keep the optimizer averages on CPU")
+
 
     args = parser.parse_args()
 
@@ -802,25 +806,26 @@ def main():
     model = BertForQuestionAnswering(bert_config)
     if args.init_checkpoint is not None:
         model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
-    model.to(device)
-
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
 
+    if not args.optimize_on_cpu:
+        model.to(device)
     no_decay = ['bias', 'gamma', 'beta']
     optimizer_parameters = [
         {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01},
         {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0}
         ]
-
     optimizer = BERTAdam(optimizer_parameters,
                          lr=args.learning_rate,
                          warmup=args.warmup_proportion,
                          t_total=num_train_steps)
 
+    model.to(device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
     global_step = 0
     if args.do_train:
         train_features = convert_examples_to_features(
@@ -862,8 +867,12 @@ def main():
                     loss = loss / args.gradient_accumulation_steps
                 loss.backward()
                 if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.optimize_on_cpu:
+                        model.to('cpu')
                     optimizer.step()    # We have accumulated enought gradients
                     model.zero_grad()
+                    if args.optimize_on_cpu:
+                        model.to(device)
                     global_step += 1
 
     if args.do_predict:

From ea85cca8abc4ed11bded457630fca83130b4c618 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 9 Nov 2018 11:42:37 +0100
Subject: [PATCH 18/28] adding optimize_on_cpu explanation in readme

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 45c467e5dd..cd8d04a240 100644
--- a/README.md
+++ b/README.md
@@ -100,10 +100,11 @@ python -m pytest -sv tests/
 
 BERT-base and BERT-large are respectively 110M and 340M parameters models and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most case a batch size of 32).
 
-To help with fine-tuning these models, we have included three techniques that you can activate in the fine-tuning scripts `run_classifier.py` and `run_squad.py`: gradient-accumulation, multi-gpu and distributed training. For more details on how to use these techniques you can read [the tips on training large batches in PyTorch](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) that I published earlier this month.
+To help with fine-tuning these models, we have included four techniques that you can activate in the fine-tuning scripts `run_classifier.py` and `run_squad.py`: optimize on CPU, gradient-accumulation, multi-gpu and distributed training. For more details on how to use these techniques you can read [the tips on training large batches in PyTorch](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) that I published earlier this month.
 
 Here is how to use these techniques in our scripts:
 
+- **Optimize on CPU**: The Adam optimizer comprise 2 moving average of all the weights of the model which means that if you keep them on GPU 1 (typical behavior), your first GPU will have to store 3-times the size of the model. This is not optimal when using a large model like `BERT-large` and means your batch size is a lot lower than it could be. This option will perform the optimization and store the averages on the CPU to free more room on the GPU(s). As the most computational intensive operation is the backward pass, this usually doesn't increase the computation time by a lot. This is the only way to fine-tune `BERT-large` in a reasonable time on GPU(s) (see below). Activate this option with `--optimize_on_cpu` on the `run_squad.py` script.
 - **Gradient Accumulation**: Gradient accumulation can be used by supplying a integer greater than 1 to the `--gradient_accumulation_steps` argument. The batch at each step will be divided by this integer and gradient will be accumulated over `gradient_accumulation_steps` steps.
 - **Multi-GPU**: Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs.
 - **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument. To use Distributed training, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see the above blog post for more details):

From a81a1ef8e9e839c9c50bdc5fae69afbeffb46036 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 10 Nov 2018 16:11:14 +0100
Subject: [PATCH 19/28] fixing learning rate schedule when using
 gradient_accumulation_steps

---
 run_classifier.py |  2 +-
 run_squad.py      | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/run_classifier.py b/run_classifier.py
index ab5251b1c0..b9aafce645 100644
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -464,7 +464,7 @@ def main():
     if args.do_train:
         train_examples = processor.get_train_examples(args.data_dir)
         num_train_steps = int(
-            len(train_examples) / args.train_batch_size * args.num_train_epochs)
+            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
 
     model = BertForSequenceClassification(bert_config, len(label_list))
     if args.init_checkpoint is not None:
diff --git a/run_squad.py b/run_squad.py
index e44044f9a0..9a9fbb61d5 100644
--- a/run_squad.py
+++ b/run_squad.py
@@ -742,6 +742,10 @@ def main():
                         default=False,
                         action='store_true',
                         help="Whether to perform optimization and keep the optimizer averages on CPU")
+    parser.add_argument('--fp16',
+                        default=False,
+                        action='store_true',
+                        help="Whether to use 16-bit float precision instead of 32-bit")
 
 
     args = parser.parse_args()
@@ -801,11 +805,13 @@ def main():
         train_examples = read_squad_examples(
             input_file=args.train_file, is_training=True)
         num_train_steps = int(
-            len(train_examples) / args.train_batch_size * args.num_train_epochs)
+            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
 
     model = BertForQuestionAnswering(bert_config)
     if args.init_checkpoint is not None:
         model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
+    if args.fp16:
+        model.half()
 
     if not args.optimize_on_cpu:
         model.to(device)
@@ -847,6 +853,12 @@ def main():
         all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
         all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
 
+        if args.fp16:
+            (all_input_ids, all_input_mask,
+             all_segment_ids, all_start_positions,
+             all_end_positions) = tuple(t.half() for t in (all_input_ids, all_input_mask, all_segment_ids,
+                                                           all_start_positions, all_end_positions))
+
         train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_start_positions, all_end_positions)
         if args.local_rank == -1:
@@ -895,6 +907,10 @@ def main():
         all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
         all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
         all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        if args.fp16:
+            (all_input_ids, all_input_mask,
+             all_segment_ids, all_example_index) = tuple(t.half() for t in (all_input_ids, all_input_mask,
+                                                                            all_segment_ids, all_example_index))
 
         eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
         if args.local_rank == -1:

From c4bfc646f5cc11c5a6dfe7166e58a36e917fd5b1 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Sun, 11 Nov 2018 16:59:35 +0100
Subject: [PATCH 20/28] Add results of fine-tuning BERT-large on GPUs

---
 README.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/README.md b/README.md
index cd8d04a240..4eb31a0ece 100644
--- a/README.md
+++ b/README.md
@@ -201,3 +201,21 @@ Training with the previous hyper-parameters gave us the following results:
 ```bash
 {"f1": 88.52381567990474, "exact_match": 81.22043519394512}
 ```
+
+# Fine-tuning BERT-large on GPUs
+
+The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation.
+
+For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results:
+```bash
+{"exact_match": 84.56953642384106, "f1": 91.04028647786927}
+```
+To get these results that we used a combination of:
+- multi-GPU training (automatically activated on a multi-GPU server),
+- 2 steps of gradient accumulation and
+- perform the optimization step on CPU to store Adam's averages in RAM.
+
+Here are the full list of hyper-parameters we used for this run:
+```bash
+python ./run_squad.py --vocab_file $BERT_LARGE_DIR/vocab.txt --bert_config_file $BERT_LARGE_DIR/bert_config.json --init_checkpoint $BERT_LARGE_DIR/pytorch_model.bin --do_lower_case --do_train --do_predict --train_file $SQUAD_TRAIN --predict_file $SQUAD_EVAL --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 384 --doc_stride 128 --output_dir $OUTPUT_DIR/bert_large_bsz_24 --train_batch_size 24 --gradient_accumulation_steps 2 --optimize_on_cpu
+```

From 6d6b916f48c10483f6b0f07263568e79c8797c9e Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Sun, 11 Nov 2018 17:00:49 +0100
Subject: [PATCH 21/28] update to BERT-large results

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4eb31a0ece..17fb2f1f70 100644
--- a/README.md
+++ b/README.md
@@ -206,7 +206,7 @@ Training with the previous hyper-parameters gave us the following results:
 
 The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation.
 
-For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results:
+For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher):
 ```bash
 {"exact_match": 84.56953642384106, "f1": 91.04028647786927}
 ```

From fa1aa81f2623cbb3243ba67b0fb5a862851d8eda Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Mon, 12 Nov 2018 08:37:43 +0100
Subject: [PATCH 22/28] fix typo in readme bach examples

---
 README.md | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 17fb2f1f70..ff0bdffb4c 100644
--- a/README.md
+++ b/README.md
@@ -186,7 +186,7 @@ python run_squad.py \
   --init_checkpoint $BERT_PYTORCH_DIR/pytorch_model.bin \
   --do_train \
   --do_predict \
-  --do_lower_case
+  --do_lower_case \
   --train_file $SQUAD_DIR/train-v1.1.json \
   --predict_file $SQUAD_DIR/dev-v1.1.json \
   --train_batch_size 12 \
@@ -217,5 +217,21 @@ To get these results that we used a combination of:
 
 Here are the full list of hyper-parameters we used for this run:
 ```bash
-python ./run_squad.py --vocab_file $BERT_LARGE_DIR/vocab.txt --bert_config_file $BERT_LARGE_DIR/bert_config.json --init_checkpoint $BERT_LARGE_DIR/pytorch_model.bin --do_lower_case --do_train --do_predict --train_file $SQUAD_TRAIN --predict_file $SQUAD_EVAL --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 384 --doc_stride 128 --output_dir $OUTPUT_DIR/bert_large_bsz_24 --train_batch_size 24 --gradient_accumulation_steps 2 --optimize_on_cpu
+python ./run_squad.py \
+  --vocab_file $BERT_LARGE_DIR/vocab.txt \
+  --bert_config_file $BERT_LARGE_DIR/bert_config.json \
+  --init_checkpoint $BERT_LARGE_DIR/pytorch_model.bin \
+  --do_lower_case \
+  --do_train \
+  --do_predict \
+  --train_file $SQUAD_TRAIN \
+  --predict_file $SQUAD_EVAL \
+  --learning_rate 3e-5 \
+  --num_train_epochs 2 \
+  --max_seq_length 384 \
+  --doc_stride 128 \
+  --output_dir $OUTPUT_DIR \
+  --train_batch_size 24 \
+  --gradient_accumulation_steps 2 \
+  --optimize_on_cpu
 ```

From 5dfd19060a7ab961080fa8360ed6ab7ec6c88834 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Mon, 12 Nov 2018 12:39:57 +0100
Subject: [PATCH 23/28] fix typo in readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ff0bdffb4c..5265ec17a9 100644
--- a/README.md
+++ b/README.md
@@ -210,12 +210,12 @@ For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80
 ```bash
 {"exact_match": 84.56953642384106, "f1": 91.04028647786927}
 ```
-To get these results that we used a combination of:
+To get these results we used a combination of:
 - multi-GPU training (automatically activated on a multi-GPU server),
 - 2 steps of gradient accumulation and
 - perform the optimization step on CPU to store Adam's averages in RAM.
 
-Here are the full list of hyper-parameters we used for this run:
+Here are the full list of hyper-parameters for this run:
 ```bash
 python ./run_squad.py \
   --vocab_file $BERT_LARGE_DIR/vocab.txt \

From 66b0090877db3f9b65f24b21400f1e29a23f72d6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 12 Nov 2018 15:15:02 +0100
Subject: [PATCH 24/28] add fp16 training

---
 README.md         |  11 ++---
 modeling.py       |   2 +-
 run_classifier.py |  88 +++++++++++++++++++++++++++++++------
 run_squad.py      | 108 +++++++++++++++++++++++++++++++---------------
 4 files changed, 157 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index 5265ec17a9..525ad979f7 100644
--- a/README.md
+++ b/README.md
@@ -100,19 +100,20 @@ python -m pytest -sv tests/
 
 BERT-base and BERT-large are respectively 110M and 340M parameters models and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most case a batch size of 32).
 
-To help with fine-tuning these models, we have included four techniques that you can activate in the fine-tuning scripts `run_classifier.py` and `run_squad.py`: optimize on CPU, gradient-accumulation, multi-gpu and distributed training. For more details on how to use these techniques you can read [the tips on training large batches in PyTorch](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) that I published earlier this month.
+To help with fine-tuning these models, we have included five techniques that you can activate in the fine-tuning scripts `run_classifier.py` and `run_squad.py`: gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training. For more details on how to use these techniques you can read [the tips on training large batches in PyTorch](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) that I published earlier this month.
 
 Here is how to use these techniques in our scripts:
 
-- **Optimize on CPU**: The Adam optimizer comprise 2 moving average of all the weights of the model which means that if you keep them on GPU 1 (typical behavior), your first GPU will have to store 3-times the size of the model. This is not optimal when using a large model like `BERT-large` and means your batch size is a lot lower than it could be. This option will perform the optimization and store the averages on the CPU to free more room on the GPU(s). As the most computational intensive operation is the backward pass, this usually doesn't increase the computation time by a lot. This is the only way to fine-tune `BERT-large` in a reasonable time on GPU(s) (see below). Activate this option with `--optimize_on_cpu` on the `run_squad.py` script.
 - **Gradient Accumulation**: Gradient accumulation can be used by supplying a integer greater than 1 to the `--gradient_accumulation_steps` argument. The batch at each step will be divided by this integer and gradient will be accumulated over `gradient_accumulation_steps` steps.
 - **Multi-GPU**: Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs.
-- **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument. To use Distributed training, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see the above blog post for more details):
+- **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument.
+- **Optimize on CPU**: The Adam optimizer comprises 2 moving averages of all the weights of the model which means that if you keep them on GPU 1 (typical behavior), your first GPU will have to store 3-times the size of the model. This is not optimal when using a large model like `BERT-large` and means your batch size is a lot lower than it could be. This option will perform the optimization and store the averages on the CPU to free more room on the GPU(s). As the most computationally intensive operation is the backward pass, this usually doesn't increase the computation time by a lot. This is the only way to fine-tune `BERT-large` in a reasonable time on GPU(s) (see below). Activate this option with `--optimize_on_cpu` on the `run_squad.py` script.
+- **16-bits training**: 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by a factor of 2, basically allowing you to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. 16-bits training natively incorporates the behavior of `--optimize_on_cpu`, so the two flags do not need to be set at the same time. A good introduction to mixed-precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and the full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag and you can play with loss scaling using the `--loss_scale` flag (see the previously linked documentation for details on loss scaling). If the loss scaling is too high (`NaN` in the gradients) it will be automatically scaled down until the value is acceptable. The default loss scaling is 128 which behaved nicely in our tests.
 
+Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) for more details):
 ```bash
 python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
 ```
-
 Where `$THIS_MACHINE_INDEX` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address `192.168.1.1` and an open port `1234`.
 
 ## TPU support and pretraining scripts
@@ -215,7 +216,7 @@ To get these results we used a combination of:
 - 2 steps of gradient accumulation and
 - perform the optimization step on CPU to store Adam's averages in RAM.
 
-Here are the full list of hyper-parameters for this run:
+Here is the full list of hyper-parameters for this run:
 ```bash
 python ./run_squad.py \
   --vocab_file $BERT_LARGE_DIR/vocab.txt \
diff --git a/modeling.py b/modeling.py
index 9c6fa38e05..66b0de68d9 100644
--- a/modeling.py
+++ b/modeling.py
@@ -348,7 +348,7 @@ class BertModel(nn.Module):
         # positions we want to attend and -10000.0 for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.float()
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
         embedding_output = self.embeddings(input_ids, token_type_ids)
diff --git a/run_classifier.py b/run_classifier.py
index b9aafce645..2f58382ede 100644
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -309,6 +309,32 @@ def accuracy(out, labels):
     outputs = np.argmax(out, axis=1)
     return np.sum(outputs==labels)
 
+def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
+    """ Utility function for optimize_on_cpu and 16-bits training.
+        Copy the parameters optimized on CPU/RAM back to the model on GPU: each optimizer-side parameter's data is copied in place into the model parameter paired with it by iteration order; a name mismatch within a pair raises ValueError.
+    """
+    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
+        if name_opti != name_model:  # both iterables must yield the same names in the same order
+            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
+            raise ValueError
+        param_model.data.copy_(param_opti.data)  # in-place copy into the model's existing tensor
+
+def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
+    """ Utility function for optimize_on_cpu and 16-bits training.
+        Copy the gradient of the GPU parameters to the CPU/RAM copy of the model (pairs are matched by iteration order; a name mismatch raises ValueError). Returns True only if test_nan is set and some model gradient contains a NaN.
+    """
+    is_nan = False
+    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
+        if name_opti != name_model:  # both iterables must yield the same names in the same order
+            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
+            raise ValueError
+        if test_nan and torch.isnan(param_model.grad).sum() > 0:  # the flag is sticky; copying continues regardless
+            is_nan = True
+        if param_opti.grad is None:  # first call: the optimizer-side parameter has no grad buffer yet
+            param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))  # same-size buffer, overwritten by the copy below
+        param_opti.grad.data.copy_(param_model.grad.data)  # NOTE(review): param_model.grad is dereferenced unconditionally, so a parameter without a gradient would fail here
+    return is_nan
+
 def main():
     parser = argparse.ArgumentParser()
 
@@ -404,6 +430,18 @@ def main():
                         type=int,
                         default=1,
                         help="Number of updates steps to accumualte before performing a backward/update pass.")                       
+    parser.add_argument('--optimize_on_cpu',
+                        default=False,
+                        action='store_true',
+                        help="Whether to perform optimization and keep the optimizer averages on CPU")
+    parser.add_argument('--fp16',
+                        default=False,
+                        action='store_true',
+                        help="Whether to use 16-bit float precision instead of 32-bit")
+    parser.add_argument('--loss_scale',
+                        type=float, default=128,
+                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
+
     args = parser.parse_args()
 
     processors = {
@@ -420,6 +458,9 @@ def main():
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
+        if args.fp16:
+            logger.info("16-bits training currently not supported in distributed training")
+            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
     logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
 
     if args.gradient_accumulation_steps < 1:
@@ -466,24 +507,34 @@ def main():
         num_train_steps = int(
             len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
 
+    # Prepare model
     model = BertForSequenceClassification(bert_config, len(label_list))
     if args.init_checkpoint is not None:
         model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
+    if args.fp16:
+        model.half()
     model.to(device)
-
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                           output_device=args.local_rank)
     elif n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
+    # Prepare optimizer
+    if args.fp16:
+        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
+                            for n, param in model.named_parameters()]
+    elif args.optimize_on_cpu:
+        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
+                            for n, param in model.named_parameters()]
+    else:
+        param_optimizer = list(model.named_parameters())
     no_decay = ['bias', 'gamma', 'beta']
-    optimizer_parameters = [
-        {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01},
-        {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0}
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01},
+        {'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0}
         ]
-
-    optimizer = BERTAdam(optimizer_parameters,
+    optimizer = BERTAdam(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          warmup=args.warmup_proportion,
                          t_total=num_train_steps)
@@ -496,12 +547,10 @@ def main():
         logger.info("  Num examples = %d", len(train_examples))
         logger.info("  Batch size = %d", args.train_batch_size)
         logger.info("  Num steps = %d", num_train_steps)
-
         all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
         all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
         all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
         all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-
         train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
         if args.local_rank == -1:
             train_sampler = RandomSampler(train_data)
@@ -519,6 +568,10 @@ def main():
                 loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
+                if args.fp16 and args.loss_scale != 1.0:
+                    # rescale loss for fp16 training
+                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
+                    loss = loss * args.loss_scale
                 if args.gradient_accumulation_steps > 1:
                     loss = loss / args.gradient_accumulation_steps
                 loss.backward()
@@ -526,7 +579,21 @@ def main():
                 nb_tr_examples += input_ids.size(0)
                 nb_tr_steps += 1
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    optimizer.step()    # We have accumulated enought gradients
+                    if args.fp16 or args.optimize_on_cpu:
+                        if args.fp16 and args.loss_scale != 1.0:
+                            # scale down gradients for fp16 training
+                            for param in model.parameters():
+                                param.grad.data = param.grad.data / args.loss_scale
+                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
+                        if is_nan:
+                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
+                            args.loss_scale = args.loss_scale / 2
+                            model.zero_grad()
+                            continue
+                        optimizer.step()
+                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
+                    else:
+                        optimizer.step()
                     model.zero_grad()
                     global_step += 1
 
@@ -534,16 +601,13 @@ def main():
         eval_examples = processor.get_dev_examples(args.data_dir)
         eval_features = convert_examples_to_features(
             eval_examples, label_list, args.max_seq_length, tokenizer)
-
         logger.info("***** Running evaluation *****")
         logger.info("  Num examples = %d", len(eval_examples))
         logger.info("  Batch size = %d", args.eval_batch_size)
-
         all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
         all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
         all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
         all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-
         eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
         if args.local_rank == -1:
             eval_sampler = SequentialSampler(eval_data)
diff --git a/run_squad.py b/run_squad.py
index 9a9fbb61d5..248b92c504 100644
--- a/run_squad.py
+++ b/run_squad.py
@@ -669,6 +669,31 @@ def _compute_softmax(scores):
         probs.append(score / total_sum)
     return probs
 
+def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
+    """ Helper for optimize_on_cpu and 16-bits training: write the optimizer-side weights back into the model, in place.
+        Both arguments are (name, parameter) sequences walked in lockstep; the first differing name is logged and raises ValueError. """
+    for (src_name, src_param), (dst_name, dst_param) in zip(named_params_optimizer, named_params_model):
+        if src_name == dst_name:
+            dst_param.data.copy_(src_param.data)
+            continue
+        logger.error("name_opti != name_model: {} {}".format(src_name, dst_name))
+        raise ValueError
+
+def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
+    """ Copy each model gradient onto the same-named optimizer-side parameter (optimize_on_cpu / 16-bits training); returns True only if test_nan is set and a NaN was found, raises ValueError on a name mismatch. """
+    is_nan = False
+    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
+        if name_opti != name_model:  # both sequences must list the same parameters in the same order
+            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
+            raise ValueError
+        if param_model.grad is None:  # backward produced no gradient for this parameter: nothing to copy
+            param_opti.grad = None  # clear any stale gradient instead of leaving an old value in place
+            continue
+        is_nan = is_nan or bool(test_nan and torch.isnan(param_model.grad).any())
+        if param_opti.grad is None:  # first use: allocate a buffer shaped like the optimizer-side parameter
+            param_opti.grad = torch.zeros_like(param_opti.data)
+        param_opti.grad.data.copy_(param_model.grad.data)  # in-place copy, the buffer is reused across steps
+    return is_nan
 
 def main():
     parser = argparse.ArgumentParser()
@@ -746,7 +771,9 @@ def main():
                         default=False,
                         action='store_true',
                         help="Whether to use 16-bit float precision instead of 32-bit")
-
+    parser.add_argument('--loss_scale',
+                        type=float, default=128,
+                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
 
     args = parser.parse_args()
 
@@ -758,7 +785,11 @@ def main():
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
+        if args.fp16:
+            logger.info("16-bits training currently not supported in distributed training")
+            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
 
     if args.gradient_accumulation_steps < 1:
         raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
@@ -807,24 +838,12 @@ def main():
         num_train_steps = int(
             len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
 
+    # Prepare model
     model = BertForQuestionAnswering(bert_config)
     if args.init_checkpoint is not None:
         model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
     if args.fp16:
         model.half()
-
-    if not args.optimize_on_cpu:
-        model.to(device)
-    no_decay = ['bias', 'gamma', 'beta']
-    optimizer_parameters = [
-        {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01},
-        {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0}
-        ]
-    optimizer = BERTAdam(optimizer_parameters,
-                         lr=args.learning_rate,
-                         warmup=args.warmup_proportion,
-                         t_total=num_train_steps)
-
     model.to(device)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
@@ -832,6 +851,25 @@ def main():
     elif n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
+    # Prepare optimizer
+    if args.fp16:
+        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
+                            for n, param in model.named_parameters()]
+    elif args.optimize_on_cpu:
+        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
+                            for n, param in model.named_parameters()]
+    else:
+        param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'gamma', 'beta']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01},
+        {'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0}
+        ]
+    optimizer = BERTAdam(optimizer_grouped_parameters,
+                         lr=args.learning_rate,
+                         warmup=args.warmup_proportion,
+                         t_total=num_train_steps)
+
     global_step = 0
     if args.do_train:
         train_features = convert_examples_to_features(
@@ -846,19 +884,11 @@ def main():
         logger.info("  Num split examples = %d", len(train_features))
         logger.info("  Batch size = %d", args.train_batch_size)
         logger.info("  Num steps = %d", num_train_steps)
-
         all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
         all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
         all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
         all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
         all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
-
-        if args.fp16:
-            (all_input_ids, all_input_mask,
-             all_segment_ids, all_start_positions,
-             all_end_positions) = tuple(t.half() for t in (all_input_ids, all_input_mask, all_segment_ids,
-                                                           all_start_positions, all_end_positions))
-
         train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_start_positions, all_end_positions)
         if args.local_rank == -1:
@@ -870,21 +900,36 @@ def main():
         model.train()
         for _ in trange(int(args.num_train_epochs), desc="Epoch"):
             for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
-                batch = tuple(t.to(device) for t in batch)
+                if n_gpu == 1:
+                    batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
                 input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                 loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
+                if args.fp16 and args.loss_scale != 1.0:
+                    # rescale loss for fp16 training
+                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
+                    loss = loss * args.loss_scale
                 if args.gradient_accumulation_steps > 1:
                     loss = loss / args.gradient_accumulation_steps
                 loss.backward()
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.optimize_on_cpu:
-                        model.to('cpu')
-                    optimizer.step()    # We have accumulated enought gradients
+                    if args.fp16 or args.optimize_on_cpu:
+                        if args.fp16 and args.loss_scale != 1.0:
+                            # scale down gradients for fp16 training
+                            for param in model.parameters():
+                                param.grad.data = param.grad.data / args.loss_scale
+                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
+                        if is_nan:
+                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
+                            args.loss_scale = args.loss_scale / 2
+                            model.zero_grad()
+                            continue
+                        optimizer.step()
+                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
+                    else:
+                        optimizer.step()
                     model.zero_grad()
-                    if args.optimize_on_cpu:
-                        model.to(device)
                     global_step += 1
 
     if args.do_predict:
@@ -907,11 +952,6 @@ def main():
         all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
         all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
         all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        if args.fp16:
-            (all_input_ids, all_input_mask,
-             all_segment_ids, all_example_index) = tuple(t.half() for t in (all_input_ids, all_input_mask,
-                                                                            all_segment_ids, all_example_index))
-
         eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
         if args.local_rank == -1:
             eval_sampler = SequentialSampler(eval_data)

From 1cf0a16c6720b4c8b7bb0843c7cdd4c2d5744b4f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 12 Nov 2018 15:24:47 +0100
Subject: [PATCH 25/28] cleaning up readme

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 525ad979f7..d9d7273140 100644
--- a/README.md
+++ b/README.md
@@ -106,9 +106,9 @@ Here is how to use these techniques in our scripts:
 
 - **Gradient Accumulation**: Gradient accumulation can be used by supplying a integer greater than 1 to the `--gradient_accumulation_steps` argument. The batch at each step will be divided by this integer and gradient will be accumulated over `gradient_accumulation_steps` steps.
 - **Multi-GPU**: Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs.
-- **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument.
-- **Optimize on CPU**: The Adam optimizer comprise 2 moving average of all the weights of the model which means that if you keep them on GPU 1 (typical behavior), your first GPU will have to store 3-times the size of the model. This is not optimal when using a large model like `BERT-large` and means your batch size is a lot lower than it could be. This option will perform the optimization and store the averages on the CPU to free more room on the GPU(s). As the most computational intensive operation is the backward pass, this usually doesn't increase the computation time by a lot. This is the only way to fine-tune `BERT-large` in a reasonable time on GPU(s) (see below). Activate this option with `--optimize_on_cpu` on the `run_squad.py` script.
-- **16-bits training**: 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by a factor of 2, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. 16bits training natively incoporate the behavior of `--optimize_on_gpu` so it's not needed to have the two flags at the same time. A good introduction to Mixed precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and a full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag and you can play with loss scaling using the `--loss_scaling` flag (see the previously linked documentation for details on loss scaling). If the loss scaling is too high (`Nan` in the gradients) it will be automatically scaled down until the value is acceptable. The default loss scaling is 128 which behaved nicely in our tests.
+- **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument (see below).
+- **Optimize on CPU**: The Adam optimizer stores 2 moving average of the weights of the model. If you keep them on GPU 1 (typical behavior), your first GPU will have to store 3-times the size of the model. This is not optimal for large models like `BERT-large` and means your batch size is a lot lower than it could be. This option will perform the optimization and store the averages on the CPU/RAM to free more room on the GPU(s). As the most computational intensive operation is usually the backward pass, this doesn't impact too much the training time. Activate this option with `--optimize_on_cpu` on the `run_squad.py` script.
+- **16-bits training**: 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and a full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag and you can play with loss scaling using the `--loss_scaling` flag (see the previously linked documentation for details on loss scaling). If the loss scaling is too high (`Nan` in the gradients) it will be automatically scaled down until the value is acceptable. The default loss scaling is 128 which behaved nicely in our tests.
 
 Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post]((https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255)) for more details):
 ```bash

From d940eeda5457d7447bdb045f9f0663f651bf1497 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 12 Nov 2018 15:26:46 +0100
Subject: [PATCH 26/28] typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d9d7273140..c5b56a869d 100644
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ Here is how to use these techniques in our scripts:
 - **Gradient Accumulation**: Gradient accumulation can be used by supplying a integer greater than 1 to the `--gradient_accumulation_steps` argument. The batch at each step will be divided by this integer and gradient will be accumulated over `gradient_accumulation_steps` steps.
 - **Multi-GPU**: Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs.
 - **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument (see below).
-- **Optimize on CPU**: The Adam optimizer stores 2 moving average of the weights of the model. If you keep them on GPU 1 (typical behavior), your first GPU will have to store 3-times the size of the model. This is not optimal for large models like `BERT-large` and means your batch size is a lot lower than it could be. This option will perform the optimization and store the averages on the CPU/RAM to free more room on the GPU(s). As the most computational intensive operation is usually the backward pass, this doesn't impact too much the training time. Activate this option with `--optimize_on_cpu` on the `run_squad.py` script.
+- **Optimize on CPU**: The Adam optimizer stores 2 moving average of the weights of the model. If you keep them on GPU 1 (typical behavior), your first GPU will have to store 3-times the size of the model. This is not optimal for large models like `BERT-large` and means your batch size is a lot lower than it could be. This option will perform the optimization and store the averages on the CPU/RAM to free more room on the GPU(s). As the most computational intensive operation is usually the backward pass, this doesn't have a significant impact on the training time. Activate this option with `--optimize_on_cpu` on the `run_squad.py` script.
 - **16-bits training**: 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and a full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag and you can play with loss scaling using the `--loss_scaling` flag (see the previously linked documentation for details on loss scaling). If the loss scaling is too high (`Nan` in the gradients) it will be automatically scaled down until the value is acceptable. The default loss scaling is 128 which behaved nicely in our tests.
 
 Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post]((https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255)) for more details):

From 278fd28a32e50a31d7f9fb20c120618cc6370ee6 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Tue, 13 Nov 2018 09:34:49 +0100
Subject: [PATCH 27/28] added results for 16-bit fine-tuning in readme

---
 README.md | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/README.md b/README.md
index c5b56a869d..cd4c7c97dd 100644
--- a/README.md
+++ b/README.md
@@ -236,3 +236,31 @@ python ./run_squad.py \
   --gradient_accumulation_steps 2 \
   --optimize_on_cpu
 ```
+
+If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16).
+
+Here is an example of hyper-parameters for an FP16 run we tried:
+```bash
+python ./run_squad.py \
+  --vocab_file $BERT_LARGE_DIR/vocab.txt \
+  --bert_config_file $BERT_LARGE_DIR/bert_config.json \
+  --init_checkpoint $BERT_LARGE_DIR/pytorch_model.bin \
+  --do_lower_case \
+  --do_train \
+  --do_predict \
+  --train_file $SQUAD_TRAIN \
+  --predict_file $SQUAD_EVAL \
+  --learning_rate 3e-5 \
+  --num_train_epochs 2 \
+  --max_seq_length 384 \
+  --doc_stride 128 \
+  --output_dir $OUTPUT_DIR \
+  --train_batch_size 24 \
+  --fp16 \
+  --loss_scale 128
+```
+
+The results were similar to the above FP32 results (actually slightly higher):
+```bash
+{"exact_match": 84.65468306527909, "f1": 91.238669287002}
+```

From 20d07b3a7f8a1f04aa94e3a7f2ec03fad641de70 Mon Sep 17 00:00:00 2001
From: Donatas Repecka <donatasrep@gmail.com>
Date: Tue, 13 Nov 2018 16:56:25 +0200
Subject: [PATCH 28/28] Excluding AdamWeightDecayOptimizer internal variables
 from restoring

---
 convert_tf_checkpoint_to_pytorch.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 convert_tf_checkpoint_to_pytorch.py

diff --git a/convert_tf_checkpoint_to_pytorch.py b/convert_tf_checkpoint_to_pytorch.py
old mode 100644
new mode 100755
index dfcdbee42d..eeebb3728e
--- a/convert_tf_checkpoint_to_pytorch.py
+++ b/convert_tf_checkpoint_to_pytorch.py
@@ -68,11 +68,17 @@ def convert():
         arrays.append(array)
 
     for name, array in zip(names, arrays):
-        name = name[5:]  # skip "bert/"
+        if not name.startswith("bert"):
+            print("Skipping {}".format(name))
+            continue
+        else:
+            name = name.replace("bert/", "")  # skip "bert/"
         print("Loading {}".format(name))
         name = name.split('/')
-        if name[0] in ['redictions', 'eq_relationship']:
-            print("Skipping")
+        # adam_v and adam_m are variables used by AdamWeightDecayOptimizer to compute m and v;
+        # they are not required for using the pretrained model
+        if name[0] in ['redictions', 'eq_relationship'] or name[-1] == "adam_v" or  name[-1] == "adam_m":
+            print("Skipping {}".format("/".join(name)))
             continue
         pointer = model
         for m_name in name: