From 352e3ff998c7e743439f3d42086ce17edea1abbf Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 16 Jul 2019 09:03:49 +0200
Subject: [PATCH] added migration guide to readme

---
 README.md                              | 102 ++++++++++++++++++++++++-
 pytorch_transformers/modeling_utils.py |   3 +-
 2 files changed, 101 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 89a0d7a4fe..4b56f24920 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,8 @@ These implementations have been tested on several datasets (see the example scri
 |-|-|
 | [Installation](#installation) | How to install the package |
 | [Quick tour: Usage](#quick-tour-usage) | Tokenizers & models usage: Bert and GPT-2 |
-| [Quick tour: Fine-tuning/usage scripts](#quick-tour-fine-tuning/usage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
+| [Quick tour: Fine-tuning/usage scripts](#quick-tour-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
+| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
 | [Documentation](#documentation) | Full API documentation and more |
 
 ## Installation
@@ -350,7 +351,102 @@ python ./examples/run_glue.py \
 
 The full documentation is available at https://huggingface.co/pytorch-transformers/.
 
+## Migrating from pytorch-pretrained-bert to pytorch-transformers
+
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`:
+
+### Models always output `tuples`
+
+The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models' forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+
+In pretty much every case, you will be fine taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
+
+Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
+
+```python
+# Let's load our model
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+# If you used to have this line in pytorch-pretrained-bert:
+loss = model(input_ids, labels=labels)
+
+# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
+outputs = model(input_ids, labels=labels)
+loss = outputs[0]
+
+# In pytorch-transformers you can also have access to the logits:
+loss, logits = outputs[:2]
+
+# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
+outputs = model(input_ids, labels=labels)
+loss, logits, attentions = outputs
+```
+
+### Serialization
+
+While not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
+
+Here is an example:
+
+```python
+### Let's load a model and tokenizer
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+### Do some stuff to our model and tokenizer
+# Ex: add new tokens to the vocabulary and embeddings of our model
+tokenizer.add_tokens(['[SPECIAL_TOKEN_1]', '[SPECIAL_TOKEN_2]'])
+model.resize_token_embeddings(len(tokenizer))
+# Train our model
+train(model)
+
+### Now let's save our model and tokenizer to a directory
+model.save_pretrained('./my_saved_model_directory/')
+tokenizer.save_pretrained('./my_saved_model_directory/')
+
+### Reload the model and the tokenizer
+model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
+tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
+```
+
+### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
+
+The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer.
+The new optimizer `AdamW` matches the PyTorch `Adam` optimizer API.
+
+The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
+
+Here is a conversion example from `BertAdam` with a linear warmup and decay schedule to `AdamW` and the same schedule:
+
+```python
+# Parameters:
+lr = 1e-3
+num_total_steps = 1000
+num_warmup_steps = 100
+warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1
+
+### Previously BertAdam optimizer was instantiated like this:
+optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
+### and used like this:
+for batch in train_data:
+    loss = model(batch)
+    loss.backward()
+    optimizer.step()
+
+### In PyTorch-Transformers, optimizer and schedules are split and instantiated like this:
+optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
+scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
+### and used like this:
+for batch in train_data:
+    loss = model(batch)
+    loss.backward()
+    scheduler.step()
+    optimizer.step()
+```
+
 ## Citation
 
-At the moment, there is no paper to cite for PyTorch-Transformers but we are working on preparing one.
-In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
+At the moment, there is no paper associated with PyTorch-Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 76a6e00db0..542c70b223 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -259,7 +259,8 @@ class PreTrainedModel(nn.Module):
                 New number of tokens in the embedding matrix.
                 Increasing the size will add newly initialized vectors at the end
                 Reducing the size will remove vectors from the end
-                If not provided or None: does nothing.
+                If not provided or None: does nothing and just returns a pointer to the input tokens Embedding Module of the model.
+
         Return: ``torch.nn.Embeddings``
             Pointer to the input tokens Embedding Module of the model
         """