simpler quick tour
This commit is contained in:
168
README.md
168
README.md
@@ -56,141 +56,65 @@ python -m pytest -sv ./pytorch_transformers/tests/
|
|||||||
python -m pytest -sv ./examples/
|
python -m pytest -sv ./examples/
|
||||||
```
|
```
|
||||||
|
|
||||||
## Quick tour: Usage
|
## Quick tour
|
||||||
|
|
||||||
Here are two quick-start examples using `Bert` and `GPT2` with pre-trained models.
|
Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/).
|
||||||
|
|
||||||
See the [documentation](#documentation) for the details of all the models and classes.
|
|
||||||
|
|
||||||
### BERT example
|
|
||||||
|
|
||||||
First let's prepare a tokenized input from a text string using `BertTokenizer`
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import torch
|
import torch
|
||||||
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
|
from pytorch_transformers import *
|
||||||
|
|
||||||
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
|
# PyTorch-Transformers has a unified API
|
||||||
import logging
|
# for 6 transformer architectures and 27 pretrained weights.
|
||||||
logging.basicConfig(level=logging.INFO)
|
# Model | Tokenizer | Pretrained weights shortcut
|
||||||
|
MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
|
||||||
|
(OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
|
||||||
|
(GPT2Model, GPT2Tokenizer, 'gpt2'),
|
||||||
|
(TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
|
||||||
|
(XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
|
||||||
|
(XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024')]
|
||||||
|
|
||||||
# Load pre-trained model tokenizer (vocabulary)
|
# Let's encode some text into a sequence of hidden-states using each model:
|
||||||
|
for model_class, tokenizer_class, pretrained_weights in MODELS:
|
||||||
|
# Load pretrained model/tokenizer
|
||||||
|
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
|
||||||
|
model = model_class.from_pretrained(pretrained_weights)
|
||||||
|
|
||||||
|
# Encode text
|
||||||
|
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])
|
||||||
|
last_hidden_states = model(input_ids)[0] # Models outputs are now tuples
|
||||||
|
|
||||||
|
# Each architecture is provided with several classes for fine-tuning on down-stream tasks, e.g.
|
||||||
|
BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
|
||||||
|
BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
|
||||||
|
BertForQuestionAnswering]
|
||||||
|
|
||||||
|
# All the classes for an architecture can be loaded from pretrained weights for this architecture
|
||||||
|
# Note that additional weights added for fine-tuning are only initialized and need to be trained on the down-stream task
|
||||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||||
|
for model_class in BERT_MODEL_CLASSES:
|
||||||
|
# Load pretrained model
|
||||||
|
model = model_class.from_pretrained('bert-base-uncased')
|
||||||
|
|
||||||
# Tokenize input
|
# Models can return the full list of hidden-states & attention weights at each layer
|
||||||
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
|
model = model_class.from_pretrained(pretrained_weights, output_hidden_states=True, output_attentions=True)
|
||||||
tokenized_text = tokenizer.tokenize(text)
|
input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
|
||||||
|
all_hidden_states, all_attentions = model(input_ids)[-2:]
|
||||||
|
|
||||||
# Mask a token that we will try to predict back with `BertForMaskedLM`
|
# Models are compatible with TorchScript
|
||||||
masked_index = 8
|
model = model_class.from_pretrained(pretrained_weights, torchscript=True)
|
||||||
tokenized_text[masked_index] = '[MASK]'
|
traced_model = torch.jit.trace(model, (input_ids,))
|
||||||
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
|
|
||||||
|
|
||||||
# Convert token to vocabulary indices
|
# Simple serialization for models and tokenizers
|
||||||
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
|
model.save_pretrained('./directory/to/save/') # save
|
||||||
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
|
model = model_class.from_pretrained('./directory/to/save/') # re-load
|
||||||
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
|
tokenizer.save_pretrained('./directory/to/save/') # save
|
||||||
|
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
|
||||||
|
|
||||||
# Convert inputs to PyTorch tensors
|
# SOTA examples for GLUE, SQUAD, text generation...
|
||||||
tokens_tensor = torch.tensor([indexed_tokens])
|
|
||||||
segments_tensors = torch.tensor([segments_ids])
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Let's see how we can use `BertModel` to encode our inputs in hidden-states:
|
## Quick tour of the fine-tuning/usage scripts
|
||||||
|
|
||||||
```python
|
|
||||||
# Load pre-trained model (weights)
|
|
||||||
model = BertModel.from_pretrained('bert-base-uncased')
|
|
||||||
|
|
||||||
# If you have a GPU, put everything on cuda
|
|
||||||
tokens_tensor = tokens_tensor.to('cuda')
|
|
||||||
segments_tensors = segments_tensors.to('cuda')
|
|
||||||
model.to('cuda')
|
|
||||||
|
|
||||||
# Predict hidden states features for each layer
|
|
||||||
with torch.no_grad():
|
|
||||||
# See the models docstrings for the detail of the inputs
|
|
||||||
outputs = model(tokens_tensor, token_type_ids=segments_tensors)
|
|
||||||
# PyTorch-Transformers models always output tuples.
|
|
||||||
# See the models docstrings for the detail of all the outputs
|
|
||||||
# In our case, the first element is the hidden state of the last layer of the Bert model
|
|
||||||
encoded_layers = outputs[0]
|
|
||||||
|
|
||||||
# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
|
|
||||||
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
|
|
||||||
```
|
|
||||||
|
|
||||||
And how to use `BertForMaskedLM` to predict a masked token:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Load pre-trained model (weights)
|
|
||||||
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
|
|
||||||
|
|
||||||
# If you have a GPU, put everything on cuda
|
|
||||||
tokens_tensor = tokens_tensor.to('cuda')
|
|
||||||
segments_tensors = segments_tensors.to('cuda')
|
|
||||||
model.to('cuda')
|
|
||||||
|
|
||||||
# Predict all tokens
|
|
||||||
with torch.no_grad():
|
|
||||||
outputs = model(tokens_tensor, token_type_ids=segments_tensors)
|
|
||||||
predictions = outputs[0]
|
|
||||||
|
|
||||||
# confirm we were able to predict 'henson'
|
|
||||||
predicted_index = torch.argmax(predictions[0, masked_index]).item()
|
|
||||||
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
|
|
||||||
assert predicted_token == 'henson'
|
|
||||||
```
|
|
||||||
|
|
||||||
### OpenAI GPT-2
|
|
||||||
|
|
||||||
Here is a quick-start example using `GPT2Tokenizer` and `GPT2LMHeadModel` class with OpenAI's pre-trained model to predict the next token from a text prompt.
|
|
||||||
|
|
||||||
First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
|
|
||||||
|
|
||||||
```python
|
|
||||||
import torch
|
|
||||||
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
|
|
||||||
|
|
||||||
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
|
|
||||||
import logging
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
|
|
||||||
# Load pre-trained model tokenizer (vocabulary)
|
|
||||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
|
||||||
|
|
||||||
# Encode a text input
|
|
||||||
text = "Who was Jim Henson ? Jim Henson was a"
|
|
||||||
indexed_tokens = tokenizer.encode(text)
|
|
||||||
|
|
||||||
# Convert indexed tokens into a PyTorch tensor
|
|
||||||
tokens_tensor = torch.tensor([indexed_tokens])
|
|
||||||
```
|
|
||||||
|
|
||||||
Let's see how to use `GPT2LMHeadModel` to generate the next token following our text:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Load pre-trained model (weights)
|
|
||||||
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
|
||||||
|
|
||||||
# If you have a GPU, put everything on cuda
|
|
||||||
tokens_tensor = tokens_tensor.to('cuda')
|
|
||||||
model.to('cuda')
|
|
||||||
|
|
||||||
# Predict all tokens
|
|
||||||
with torch.no_grad():
|
|
||||||
outputs = model(tokens_tensor)
|
|
||||||
predictions = outputs[0]
|
|
||||||
|
|
||||||
# get the predicted next sub-word (in our case, the word 'man')
|
|
||||||
predicted_index = torch.argmax(predictions[0, -1, :]).item()
|
|
||||||
predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
|
|
||||||
assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
|
|
||||||
```
|
|
||||||
|
|
||||||
Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
|
|
||||||
|
|
||||||
## Quick tour: Fine-tuning/usage scripts
|
|
||||||
|
|
||||||
The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
|
The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user