diff --git a/examples/run_glue.py b/examples/run_glue.py
index 278f5c723a..99821f454d 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -154,8 +154,8 @@ def train(args, train_dataset, model, tokenizer):
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
-                scheduler.step()  # Update learning rate schedule
                 optimizer.step()
+                scheduler.step()  # Update learning rate schedule
                 model.zero_grad()
                 global_step += 1
 
diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py
new file mode 100644
index 0000000000..6f59d15286
--- /dev/null
+++ b/examples/run_tf_glue.py
@@ -0,0 +1,69 @@
+import tensorflow as tf
+import tensorflow_datasets
+from pytorch_transformers import BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification, glue_convert_examples_to_features
+
+# Load the 'bert-base-cased' tokenizer and TF classification model, plus the GLUE MRPC dataset
+tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+tf_model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
+dataset = tensorflow_datasets.load("glue/mrpc")
+
+# Convert the train/validation splits to model features for the 'mrpc' task, then shuffle/batch them
+train_dataset = glue_convert_examples_to_features(dataset['train'], tokenizer, task='mrpc', max_length=128)
+valid_dataset = glue_convert_examples_to_features(dataset['validation'], tokenizer, task='mrpc', max_length=128)
+train_dataset = train_dataset.shuffle(100).batch(32).repeat(3)  # shuffle buffer of 100, batches of 32, repeated 3 times
+valid_dataset = valid_dataset.batch(64)
+
+# Compile the tf.keras model: Adam with a decaying learning rate and gradient-norm clipping
+learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(2e-5, 345, end_learning_rate=0)  # 2e-5 -> 0 over 345 steps (3 epochs x 115 steps, see fit() below)
+loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # from_logits=True: model outputs are treated as raw logits
+tf_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1.0),
+              loss=loss, metrics=['sparse_categorical_accuracy'])
+
+# Train for 3 epochs of 115 steps each with tf.keras.Model.fit(), validating on 7 batches
+tf_model.fit(train_dataset, epochs=3, steps_per_epoch=115, validation_data=valid_dataset, validation_steps=7)
+
+# Save the TF model to ./runs/ and reload the saved files as a PyTorch model
+tf_model.save_pretrained('./runs/')
+pt_model = BertForSequenceClassification.from_pretrained('./runs/')
+
+# TODO: quickly inspect a few predictions (no code follows this comment yet)
+
+
+# Miscellaneous -- NOTE(review): looks like leftover scratch code repeating the setup above; confirm it should ship
+import torch
+
+import tensorflow as tf
+import tensorflow_datasets
+from pytorch_transformers import BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification, glue_convert_examples_to_features
+
+# Load the tokenizer again and a fresh TF model (rebinds `tokenizer` from the section above)
+tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
+
+pt_train_dataset = torch.load('../../data/glue_data//MRPC/cached_train_bert-base-cased_128_mrpc')  # hard-coded relative path; presumably cached MRPC features -- TODO confirm
+
+def gen():  # yields ((input_ids, attention_mask, token_type_ids), (label,)) for each element of pt_train_dataset
+    for el in pt_train_dataset:
+        yield ((el.input_ids, el.attention_mask, el.token_type_ids), (el.label,))
+
+dataset = tf.data.Dataset.from_generator(gen,  # output dtypes and shapes are declared on the following lines
+            ((tf.int32, tf.int32, tf.int32), (tf.int64,)),
+            ((tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None])),
+             (tf.TensorShape([]),)))
+
+dataset = dataset.shuffle(100).batch(32)  # shuffle buffer of 100, batches of 32
+next(iter(dataset))  # pulls one batch and discards it -- NOTE(review): looks like a debugging leftover
+
+learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(2e-5, 345, 0)  # same schedule as above: 2e-5 -> 0 over 345 steps
+loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+model.compile(optimizer=tf.keras.optimizers.Adam(
+                  learning_rate=learning_rate,
+                  epsilon=1e-08,
+                  clipnorm=1.0),
+              loss=loss,
+              metrics=[['sparse_categorical_accuracy']])
+
+tensorboard_cbk = tf.keras.callbacks.TensorBoard(log_dir='./runs/', update_freq=10, histogram_freq=1)  # logs to ./runs/, the directory save_pretrained wrote to above
+
+# Train for 3 epochs with the TensorBoard callback attached
+model.fit(dataset, epochs=3, callbacks=[tensorboard_cbk])
diff --git a/pytorch_transformers/configuration_utils.py b/pytorch_transformers/configuration_utils.py
index fb1fe82f43..649a94e28c 100644
--- a/pytorch_transformers/configuration_utils.py
+++ b/pytorch_transformers/configuration_utils.py
@@ -67,6 +67,7 @@ class PretrainedConfig(object):
         output_config_file = os.path.join(save_directory, CONFIG_NAME)
 
         self.to_json_file(output_config_file)
+        logger.info("Configuration saved in {}".format(output_config_file))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
diff --git a/pytorch_transformers/data/processors/utils.py b/pytorch_transformers/data/processors/utils.py
index ed85f4a1f4..a616372054 100644
--- a/pytorch_transformers/data/processors/utils.py
+++ b/pytorch_transformers/data/processors/utils.py
@@ -17,6 +17,7 @@
 import csv
 import sys
 import copy
+import json
 
 class InputExample(object):
     """A single training/test example for simple sequence classification."""
diff --git a/pytorch_transformers/modeling_tf_utils.py b/pytorch_transformers/modeling_tf_utils.py
index 2186e2d488..21faee6616 100644
--- a/pytorch_transformers/modeling_tf_utils.py
+++ b/pytorch_transformers/modeling_tf_utils.py
@@ -132,8 +132,8 @@ class TFPreTrainedModel(tf.keras.Model):
 
         # If we save using the predefined names, we can load using `from_pretrained`
         output_model_file = os.path.join(save_directory, TF2_WEIGHTS_NAME)
-
         self.save_weights(output_model_file)
+        logger.info("Model weights saved in {}".format(output_model_file))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index af33c22d6e..00e1156125 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -201,8 +201,8 @@ class PreTrainedModel(nn.Module):
 
         # If we save using the predefined names, we can load using `from_pretrained`
         output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
-
         torch.save(model_to_save.state_dict(), output_model_file)
+        logger.info("Model weights saved in {}".format(output_model_file))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -305,7 +305,7 @@ class PreTrainedModel(nn.Module):
                     archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
                 else:
                     raise EnvironmentError("Error no file named {} found in directory {}".format(
-                        tuple(WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"),
+                        [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
                         pretrained_model_name_or_path))
             elif os.path.isfile(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path