From 86f23a19445a920619fceaf60a6ea6a94f253c48 Mon Sep 17 00:00:00 2001
From: Timothy Liu <timothyl@nvidia.com>
Date: Sun, 13 Oct 2019 10:21:35 +0000
Subject: [PATCH 1/3] Minor enhancements to run_tf_glue.py

---
 examples/run_tf_glue.py | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py
index f2e94ae39e..c05420d680 100644
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -1,40 +1,55 @@
+import os
 import tensorflow as tf
 import tensorflow_datasets
 from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
 
-# Load dataset, tokenizer, model from pretrained model/vocabulary
+# script parameters
+BATCH_SIZE = 32
+EVAL_BATCH_SIZE = BATCH_SIZE * 2
+
+# Load tokenizer and model from pretrained model/vocabulary
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
 model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
-data = tensorflow_datasets.load('glue/mrpc')
+
+# Load dataset via TensorFlow Datasets
+data, info = tensorflow_datasets.load('glue/mrpc', with_info=True)
+train_examples = info.splits['train'].num_examples
+valid_examples = info.splits['validation'].num_examples
 
 # Prepare dataset for GLUE as a tf.data.Dataset instance
 train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
 valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
-train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
-valid_dataset = valid_dataset.batch(64)
+train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
+valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
 
 # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
-optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
+optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
 model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 
 # Train and evaluate using tf.keras.Model.fit()
-history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
-                    validation_data=valid_dataset, validation_steps=7)
+train_steps = train_examples//BATCH_SIZE
+valid_steps = valid_examples//EVAL_BATCH_SIZE
+
+history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
+                    validation_data=valid_dataset, validation_steps=valid_steps)
+
+# Save TF2 model
+os.makedirs('./save/', exist_ok=True)
+model.save_pretrained('./save/')
 
 # Load the TensorFlow model in PyTorch for inspection
-model.save_pretrained('./save/')
 pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
 
 # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-sentence_0 = "This research was consistent with his findings."
-sentence_1 = "His findings were compatible with this research."
-sentence_2 = "His findings were not compatible with this research."
+sentence_0 = 'This research was consistent with his findings.'
+sentence_1 = 'His findings were compatible with this research.'
+sentence_2 = 'His findings were not compatible with this research.'
 inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
 inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
 
 pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
 pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
-print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
-print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
+print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
+print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')

From 376e65a67481bcd370c77b119773b11bb612b0c3 Mon Sep 17 00:00:00 2001
From: Timothy Liu <timothyl@nvidia.com>
Date: Sun, 13 Oct 2019 11:04:49 +0000
Subject: [PATCH 2/3] Added automatic mixed precision and XLA options to
 run_tf_glue.py

---
 examples/run_tf_glue.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py
index c05420d680..399fe9e616 100644
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -6,6 +6,11 @@ from transformers import BertTokenizer, TFBertForSequenceClassification, glue_co
 # script parameters
 BATCH_SIZE = 32
 EVAL_BATCH_SIZE = BATCH_SIZE * 2
+USE_XLA = False
+USE_AMP = False
+
+tf.config.optimizer.set_jit(USE_XLA)
+tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
 
 # Load tokenizer and model from pretrained model/vocabulary
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
@@ -23,10 +28,13 @@ train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
 valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
 
 # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
-optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
+opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
+if USE_AMP:
+    # loss scaling is currently required when using mixed precision
+    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
-model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
+model.compile(optimizer=opt, loss=loss, metrics=[metric])
 
 # Train and evaluate using tf.keras.Model.fit()
 train_steps = train_examples//BATCH_SIZE

From 2c1d5564ad8e7d937bccf500a12e95423f4b6545 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 15 Oct 2019 09:56:52 +0200
Subject: [PATCH 3/3] add readme information

---
 examples/README.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/examples/README.md b/examples/README.md
index 382d794fcb..9465b9ad82 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -5,12 +5,35 @@ similar API between the different models.
 
 | Section                    | Description                                                                                                                                                |
 |----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [TensorFlow 2.0 models on GLUE](#tensorflow-20-bert-models-on-glue) | Examples running the BERT TensorFlow 2.0 model on the GLUE tasks. |
 | [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
 | [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.                                         |
 | [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
 | [SQuAD](#squad) | Using BERT for question answering, examples with distributed training.                                                                                  |
 | [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
 
+## TensorFlow 2.0 Bert models on GLUE
+
+Based on the script [`run_tf_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py).
+
+Fine-tuning the library TensorFlow 2.0 Bert model for sequence classification on the MRPC task of the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/).
+
+This script has two optional performance features: mixed precision (Automatic Mixed Precision / AMP), which runs models on Tensor Cores (NVIDIA Volta/Turing GPUs and future hardware), and XLA, which uses the XLA compiler to reduce model runtime.
+The options are toggled with the `USE_XLA` and `USE_AMP` variables in the script.
+These options and the benchmarks below were provided by @tlkh.
+
+Quick benchmarks from the script (no other modifications):
+
+| GPU    | Mode | Time (2nd epoch) | Val Acc (3 runs) |
+| --------- | -------- | ----------------------- | ----------------------|
+| Titan V | FP32 | 41s | 0.8438/0.8281/0.8333 |
+| Titan V | AMP | 26s | 0.8281/0.8568/0.8411 |
+| V100    | FP32 | 35s | 0.8646/0.8359/0.8464 |
+| V100    | AMP | 22s | 0.8646/0.8385/0.8411 |
+| 1080 Ti | FP32 | 55s | - |
+
+Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used).
+
 ## Language model fine-tuning
 
 Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).