From edc79acb3b2e46cafd2b765a988639a7f0611e6f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 16 Jul 2019 16:02:32 +0200
Subject: [PATCH] simpler quick tour

---
 README.md | 168 +++++++++++++++---------------------------------------
 1 file changed, 46 insertions(+), 122 deletions(-)

diff --git a/README.md b/README.md
index 4967cd68c7..04ce7d45ed 100644
--- a/README.md
+++ b/README.md
@@ -56,141 +56,65 @@ python -m pytest -sv ./pytorch_transformers/tests/
 python -m pytest -sv ./examples/
 ```
 
-## Quick tour: Usage
+## Quick tour
 
-Here are two quick-start examples using `Bert` and `GPT2` with pre-trained models.
-
-See the [documentation](#documentation) for the details of all the models and classes.
-
-### BERT example
-
-First let's prepare a tokenized input from a text string using `BertTokenizer`
+Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/).
 
 ```python
 import torch
-from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
+from pytorch_transformers import *
 
-# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
+# PyTorch-Transformers has a unified API
+# for 6 transformer architectures and 27 pretrained weights.
+#          Model          | Tokenizer          | Pretrained weights shortcut
+MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased'),
+          (OpenAIGPTModel,  OpenAIGPTTokenizer, 'openai-gpt'),
+          (GPT2Model,       GPT2Tokenizer,      'gpt2'),
+          (TransfoXLModel,  TransfoXLTokenizer, 'transfo-xl-wt103'),
+          (XLNetModel,      XLNetTokenizer,     'xlnet-base-cased'),
+          (XLMModel,        XLMTokenizer,       'xlm-mlm-enfr-1024')]
 
-# Load pre-trained model tokenizer (vocabulary)
+# Let's encode some text in a sequence of hidden-states using each model:
+for model_class, tokenizer_class, pretrained_weights in MODELS:
+    # Load pretrained model/tokenizer
+    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
+    model = model_class.from_pretrained(pretrained_weights)
+
+    # Encode text
+    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])
+    last_hidden_states = model(input_ids)[0]  # Model outputs are now tuples
+
+# Each architecture is provided with several classes for fine-tuning on down-stream tasks, e.g.
+BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
+                      BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
+                      BertForQuestionAnswering]
+
+# All the classes for an architecture can be loaded from pretrained weights for this architecture
+# Note that additional weights added for fine-tuning are only initialized and need to be trained on the down-stream task
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+for model_class in BERT_MODEL_CLASSES:
+    # Load pretrained model
+    model = model_class.from_pretrained('bert-base-uncased')
 
-# Tokenize input
-text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-tokenized_text = tokenizer.tokenize(text)
+# Models can return the full list of hidden-states & attention weights at each layer
+model = model_class.from_pretrained(pretrained_weights, output_hidden_states=True, output_attentions=True)
+input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
+all_hidden_states, all_attentions = model(input_ids)[-2:]
 
-# Mask a token that we will try to predict back with `BertForMaskedLM`
-masked_index = 8
-tokenized_text[masked_index] = '[MASK]'
-assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
+# Models are compatible with TorchScript
+model = model_class.from_pretrained(pretrained_weights, torchscript=True)
+traced_model = torch.jit.trace(model, (input_ids,))
 
-# Convert token to vocabulary indices
-indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
-segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+# Simple serialization for models and tokenizers
+model.save_pretrained('./directory/to/save/')  # save
+model = model_class.from_pretrained('./directory/to/save/')  # re-load
+tokenizer.save_pretrained('./directory/to/save/')  # save
+tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
 
-# Convert inputs to PyTorch tensors
-tokens_tensor = torch.tensor([indexed_tokens])
-segments_tensors = torch.tensor([segments_ids])
+# SOTA examples for GLUE, SQuAD, text generation...
 ```
 
-Let's see how we can use `BertModel` to encode our inputs in hidden-states:
-
-```python
-# Load pre-trained model (weights)
-model = BertModel.from_pretrained('bert-base-uncased')
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-segments_tensors = segments_tensors.to('cuda')
-model.to('cuda')
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    # See the models docstrings for the detail of the inputs
-    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    # PyTorch-Transformers models always output tuples.
-    # See the models docstrings for the detail of all the outputs
-    # In our case, the first element is the hidden state of the last layer of the Bert model
-    encoded_layers = outputs[0]
-
-# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
-assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
-```
-
-And how to use `BertForMaskedLM` to predict a masked token:
-
-```python
-# Load pre-trained model (weights)
-model = BertForMaskedLM.from_pretrained('bert-base-uncased')
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-segments_tensors = segments_tensors.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    predictions = outputs[0]
-
-# confirm we were able to predict 'henson'
-predicted_index = torch.argmax(predictions[0, masked_index]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == 'henson'
-```
-
-### OpenAI GPT-2
-
-Here is a quick-start example using `GPT2Tokenizer` and `GPT2LMHeadModel` class with OpenAI's pre-trained model to predict the next token from a text prompt.
-
-First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
-
-```python
-import torch
-from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
-
-# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-
-# Encode a text inputs
-text = "Who was Jim Henson ? Jim Henson was a"
-indexed_tokens = tokenizer.encode(text)
-
-# Convert indexed tokens in a PyTorch tensor
-tokens_tensor = torch.tensor([indexed_tokens])
-```
-
-Let's see how to use `GPT2LMHeadModel` to generate the next token following our text:
-
-```python
-# Load pre-trained model (weights)
-model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(tokens_tensor)
-    predictions = outputs[0]
-
-# get the predicted next sub-word (in our case, the word 'man')
-predicted_index = torch.argmax(predictions[0, -1, :]).item()
-predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
-assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
-```
-
-Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
-
-## Quick tour: Fine-tuning/usage scripts
+## Quick tour of the fine-tuning/usage scripts
 
 The library comprises several example scripts with SOTA performances for NLU and NLG tasks: