From 352e3ff998c7e743439f3d42086ce17edea1abbf Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 16 Jul 2019 09:03:49 +0200 Subject: [PATCH] added migration guide to readme --- README.md | 102 ++++++++++++++++++++++++- pytorch_transformers/modeling_utils.py | 3 +- 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 89a0d7a4fe..4b56f24920 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,8 @@ These implementations have been tested on several datasets (see the example scri |-|-| | [Installation](#installation) | How to install the package | | [Quick tour: Usage](#quick-tour-usage) | Tokenizers & models usage: Bert and GPT-2 | -| [Quick tour: Fine-tuning/usage scripts](#quick-tour-fine-tuning/usage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation | +| [Quick tour: Fine-tuning/usage scripts](#quick-tour-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation | +| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers | | [Documentation](#documentation) | Full API documentation and more | ## Installation @@ -350,7 +351,102 @@ python ./examples/run_glue.py \ The full documentation is available at https://huggingface.co/pytorch-transformers/. +## Migrating from pytorch-pretrained-bert to pytorch-transformers + +Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`. + +### Models always output `tuples` + +The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models' forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters. 
+ +The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/). + +In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`. + +Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model: + +```python +# Let's load our model +model = BertForSequenceClassification.from_pretrained('bert-base-uncased') + +# If you used to have this line in pytorch-pretrained-bert: +loss = model(input_ids, labels=labels) + +# Now just use this line in pytorch-transformers to extract the loss from the output tuple: +outputs = model(input_ids, labels=labels) +loss = outputs[0] + +# In pytorch-transformers you can also have access to the logits: +loss, logits = outputs[:2] + +# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation) +model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True) +outputs = model(input_ids, labels=labels) +loss, logits, attentions = outputs +``` + +### Serialization + +While not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before. 
+ +Here is an example: + +```python +### Let's load a model and tokenizer +model = BertForSequenceClassification.from_pretrained('bert-base-uncased') +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + +### Do some stuff to our model and tokenizer +# Ex: add new tokens to the vocabulary and embeddings of our model +tokenizer.add_tokens(['[SPECIAL_TOKEN_1]', '[SPECIAL_TOKEN_2]']) +model.resize_token_embeddings(len(tokenizer)) +# Train our model +train(model) + +### Now let's save our model and tokenizer to a directory +model.save_pretrained('./my_saved_model_directory/') +tokenizer.save_pretrained('./my_saved_model_directory/') + +### Reload the model and the tokenizer +model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/') +tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/') +``` + +### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules + +The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer. +The new optimizer `AdamW` matches PyTorch `Adam` optimizer API. + +The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore. 
+ +Here is a conversion example from `BertAdam` with a linear warmup and decay schedule to `AdamW` and the same schedule: + +```python +# Parameters: +lr = 1e-3 +num_total_steps = 1000 +num_warmup_steps = 100 +warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1 + +### Previously BertAdam optimizer was instantiated like this: +optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps) +### and used like this: +for batch in train_data: + loss = model(batch) + loss.backward() + optimizer.step() + +### In PyTorch-Transformers, optimizer and schedules are split and instantiated like this: +optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False +scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler +### and used like this: +for batch in train_data: + loss = model(batch) + loss.backward() + scheduler.step() + optimizer.step() +``` + ## Citation -At the moment, there is no paper to cite for PyTorch-Transformers but we are working on preparing one. -In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project. +At the moment, there is no paper associated with PyTorch-Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project. diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py index 76a6e00db0..542c70b223 100644 --- a/pytorch_transformers/modeling_utils.py +++ b/pytorch_transformers/modeling_utils.py @@ -259,7 +259,8 @@ class PreTrainedModel(nn.Module): New number of tokens in the embedding matrix. 
Increasing the size will add newly initialized vectors at the end Reducing the size will remove vectors from the end - If not provided or None: does nothing. + If not provided or None: does nothing and just returns a pointer to the input tokens Embedding Module of the model. + Return: ``torch.nn.Embedding`` Pointer to the input tokens Embedding Module of the model """