From edc79acb3b2e46cafd2b765a988639a7f0611e6f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 16 Jul 2019 16:02:32 +0200 Subject: [PATCH] simpler quick tour --- README.md | 168 +++++++++++++++--------------------------------------- 1 file changed, 46 insertions(+), 122 deletions(-) diff --git a/README.md b/README.md index 4967cd68c7..04ce7d45ed 100644 --- a/README.md +++ b/README.md @@ -56,141 +56,65 @@ python -m pytest -sv ./pytorch_transformers/tests/ python -m pytest -sv ./examples/ ``` -## Quick tour: Usage +## Quick tour -Here are two quick-start examples using `Bert` and `GPT2` with pre-trained models. - -See the [documentation](#documentation) for the details of all the models and classes. - -### BERT example - -First let's prepare a tokenized input from a text string using `BertTokenizer` +Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/). ```python import torch -from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM +from pytorch_transformers import * -# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows -import logging -logging.basicConfig(level=logging.INFO) +# PyTorch-Transformers has a unified API +# for 6 transformer architectures and 27 pretrained weights. 
+# Model | Tokenizer | Pretrained weights shortcut +MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'), + (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'), + (GPT2Model, GPT2Tokenizer, 'gpt2'), + (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'), + (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'), + (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024')] -# Load pre-trained model tokenizer (vocabulary) +# Let's encode some text in a sequence of hidden-states using each model: +for model_class, tokenizer_class, pretrained_weights in MODELS: + # Load pretrained model/tokenizer + tokenizer = tokenizer_class.from_pretrained(pretrained_weights) + model = model_class.from_pretrained(pretrained_weights) + + # Encode text + input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")]) + last_hidden_states = model(input_ids)[0] # Model outputs are now tuples + +# Each architecture is provided with several classes for fine-tuning on down-stream tasks, e.g. +BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, + BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification, + BertForQuestionAnswering] + +# All the classes for an architecture can be loaded from pretrained weights for this architecture +# Note that additional weights added for fine-tuning are only initialized and need to be trained on the down-stream task tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') +for model_class in BERT_MODEL_CLASSES: + # Load pretrained model/tokenizer + model = model_class.from_pretrained('bert-base-uncased') -# Tokenize input -text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" -tokenized_text = tokenizer.tokenize(text) +# Models can return the full list of hidden-states & attention weights at each layer +model = model_class.from_pretrained(pretrained_weights, output_hidden_states=True, output_attentions=True) +input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")]) +all_hidden_states, all_attentions = model(input_ids)[-2:] -# Mask a token that we will try to predict back with `BertForMaskedLM` -masked_index = 8 -tokenized_text[masked_index] = '[MASK]' -assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]'] +# Models are compatible with TorchScript +model = model_class.from_pretrained(pretrained_weights, torchscript=True) +traced_model = torch.jit.trace(model, (input_ids,)) -# Convert token to vocabulary indices -indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) -# Define sentence A and B indices associated to 1st and 2nd sentences (see paper) -segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] +# Simple serialization for models and tokenizers +model.save_pretrained('./directory/to/save/') # save +model = model_class.from_pretrained('./directory/to/save/') # re-load +tokenizer.save_pretrained('./directory/to/save/') # save +tokenizer = tokenizer_class.from_pretrained(pretrained_weights) -# Convert inputs to PyTorch tensors -tokens_tensor = torch.tensor([indexed_tokens]) -segments_tensors = torch.tensor([segments_ids]) +# SOTA examples for GLUE, SQUAD, text generation... 
``` -Let's see how we can use `BertModel` to encode our inputs in hidden-states: - -```python -# Load pre-trained model (weights) -model = BertModel.from_pretrained('bert-base-uncased') - -# If you have a GPU, put everything on cuda -tokens_tensor = tokens_tensor.to('cuda') -segments_tensors = segments_tensors.to('cuda') -model.to('cuda') - -# Predict hidden states features for each layer -with torch.no_grad(): - # See the models docstrings for the detail of the inputs - outputs = model(tokens_tensor, token_type_ids=segments_tensors) - # PyTorch-Transformers models always output tuples. - # See the models docstrings for the detail of all the outputs - # In our case, the first element is the hidden state of the last layer of the Bert model - encoded_layers = outputs[0] - -# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension) -assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size) -``` - -And how to use `BertForMaskedLM` to predict a masked token: - -```python -# Load pre-trained model (weights) -model = BertForMaskedLM.from_pretrained('bert-base-uncased') - -# If you have a GPU, put everything on cuda -tokens_tensor = tokens_tensor.to('cuda') -segments_tensors = segments_tensors.to('cuda') -model.to('cuda') - -# Predict all tokens -with torch.no_grad(): - outputs = model(tokens_tensor, token_type_ids=segments_tensors) - predictions = outputs[0] - -# confirm we were able to predict 'henson' -predicted_index = torch.argmax(predictions[0, masked_index]).item() -predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] -assert predicted_token == 'henson' -``` - -### OpenAI GPT-2 - -Here is a quick-start example using `GPT2Tokenizer` and `GPT2LMHeadModel` class with OpenAI's pre-trained model to predict the next token from a text prompt. 
- -First let's prepare a tokenized input from our text string using `GPT2Tokenizer` - -```python -import torch -from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel - -# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows -import logging -logging.basicConfig(level=logging.INFO) - -# Load pre-trained model tokenizer (vocabulary) -tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - -# Encode a text inputs -text = "Who was Jim Henson ? Jim Henson was a" -indexed_tokens = tokenizer.encode(text) - -# Convert indexed tokens in a PyTorch tensor -tokens_tensor = torch.tensor([indexed_tokens]) -``` - -Let's see how to use `GPT2LMHeadModel` to generate the next token following our text: - -```python -# Load pre-trained model (weights) -model = GPT2LMHeadModel.from_pretrained('gpt2') - -# If you have a GPU, put everything on cuda -tokens_tensor = tokens_tensor.to('cuda') -model.to('cuda') - -# Predict all tokens -with torch.no_grad(): - outputs = model(tokens_tensor) - predictions = outputs[0] - -# get the predicted next sub-word (in our case, the word 'man') -predicted_index = torch.argmax(predictions[0, -1, :]).item() -predicted_text = tokenizer.decode(indexed_tokens + [predicted_index]) -assert predicted_text == 'Who was Jim Henson? Jim Henson was a man' -``` - -Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation). - -## Quick tour: Fine-tuning/usage scripts +## Quick tour of the fine-tuning/usage scripts The library comprises several example scripts with SOTA performances for NLU and NLG tasks: