From 4447f270b2ad30eeb374b5171913f3050340c506 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 17 Jun 2019 16:21:28 +0200 Subject: [PATCH] updating hub --- README.md | 45 ++++++++++++++++++++++++++++++++++++++++ hubconfs/bert_hubconf.py | 12 +++++++++++ hubconfs/gpt2_hubconf.py | 14 ++++++++----- hubconfs/gpt_hubconf.py | 17 ++++++++------- 4 files changed, 76 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index fa186adf1e..9ddeba808e 100644 --- a/README.md +++ b/README.md @@ -309,6 +309,28 @@ predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] assert predicted_token == '.' ``` +And how to use `OpenAIGPTDoubleHeadsModel` + +```python +# Load pre-trained model (weights) +model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') +model.eval() + +# Prepare tokenized input +text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" +text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man" +tokenized_text1 = tokenizer.tokenize(text1) +tokenized_text2 = tokenizer.tokenize(text2) +indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) +indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) +tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) +mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) + +# Predict the language modeling and multiple choice logits +with torch.no_grad(): + lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids) +``` + ### Transformer-XL Here is a quick-start example using `TransfoXLTokenizer`, `TransfoXLModel` and `TransfoXLModelLMHeadModel` class with the Transformer-XL model pre-trained on WikiText-103. See the [doc section](#doc) below for all the details on these classes. 
@@ -456,6 +478,29 @@ predicted_index = torch.argmax(predictions_2[0, -1, :]).item() predicted_token = tokenizer.decode([predicted_index]) ``` +And how to use `GPT2DoubleHeadsModel` + +```python +# Load pre-trained model (weights) +model = GPT2DoubleHeadsModel.from_pretrained('gpt2') +model.eval() + +# Prepare tokenized input +text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" +text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man" +tokenized_text1 = tokenizer.tokenize(text1) +tokenized_text2 = tokenizer.tokenize(text2) +indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) +indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) +tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) +mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) + +# Predict the language modeling and multiple choice logits (also returns `past`) +with torch.no_grad(): + lm_logits, multiple_choice_logits, past = model(tokens_tensor, mc_token_ids) +``` + + ## Doc Here is a detailed documentation of the classes in the package and how to use them: diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py index 0595bdeccb..3769c2567f 100644 --- a/hubconfs/bert_hubconf.py +++ b/hubconfs/bert_hubconf.py @@ -23,6 +23,9 @@ bert_docstring = """ . `bert-base-multilingual-uncased` . `bert-base-multilingual-cased` . `bert-base-chinese` + . `bert-base-german-cased` + . `bert-large-uncased-whole-word-masking` + . `bert-large-cased-whole-word-masking` - a path or url to a pretrained model archive containing: . `bert_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining @@ -81,6 +84,7 @@ def bertTokenizer(*args, **kwargs): Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"] Example: + >>> import torch >>> sentence = 'Hello, World!' 
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) >>> toks = tokenizer.tokenize(sentence) @@ -101,6 +105,7 @@ def bertModel(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -129,6 +134,7 @@ def bertForNextSentencePrediction(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -158,6 +164,7 @@ def bertForPreTraining(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -181,6 +188,7 @@ def bertForMaskedLM(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -222,6 +230,7 @@ def bertForSequenceClassification(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" @@ -256,6 +265,7 @@ def bertForMultipleChoice(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -288,6 +298,7 @@ def bertForQuestionAnswering(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" @@ -326,6 +337,7 @@ def bertForTokenClassification(*args, **kwargs): Example: # Load the tokenizer + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py index 26b53e8b03..3ac8bc72ab 100644 --- a/hubconfs/gpt2_hubconf.py +++ b/hubconfs/gpt2_hubconf.py @@ -11,7 +11,7 @@ gpt2_docstring = """ Params: pretrained_model_name_or_path: either: - a str with the name of a pre-trained model to load selected in the list of: - . `gpt2` + . `gpt2`, `gpt2-medium` - a path or url to a pretrained model archive containing: . `gpt2_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance @@ -147,10 +147,14 @@ def gpt2DoubleHeadsModel(*args, **kwargs): >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2') # Prepare tokenized input - >>> text = "Who was Jim Henson ?" 
- >>> indexed_tokens = tokenizer.encode(text) - >>> tokens_tensor = torch.tensor([indexed_tokens]) - >>> mc_token_ids = torch.LongTensor([ [len(indexed_tokens)] ]) + >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man" + >>> tokenized_text1 = tokenizer.tokenize(text1) + >>> tokenized_text2 = tokenizer.tokenize(text2) + >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) + >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) + >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) + >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) # Load gpt2DoubleHeadsModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2') diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py index 77162dc244..f3d03888ae 100644 --- a/hubconfs/gpt_hubconf.py +++ b/hubconfs/gpt_hubconf.py @@ -126,7 +126,7 @@ def openAIGPTLMHeadModel(*args, **kwargs): Example: # Load the tokenizer - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') # Prepare tokenized input @@ -161,15 +161,18 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs): Example: # Load the tokenizer - >>> import torch + >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt') # Prepare tokenized input - >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer" - >>> tokenized_text = tokenizer.tokenize(text) - >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) - >>> tokens_tensor = torch.tensor([indexed_tokens]) - >>> mc_token_ids = torch.LongTensor([ [len(tokenized_text)] ]) + >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer" + >>> text2 = "Who was Jim Henson ? 
Jim Henson was a mysterious young man" + >>> tokenized_text1 = tokenizer.tokenize(text1) + >>> tokenized_text2 = tokenizer.tokenize(text2) + >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1) + >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2) + >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]]) + >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]]) # Load openAIGPTDoubleHeadsModel >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt')