From e5c78c6684b29b0954f326c4f07926987921ba38 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 10 Jan 2019 01:40:00 +0100 Subject: [PATCH] update readme and few typos --- README.md | 8 ++++---- examples/extract_features.py | 4 ++-- pytorch_pretrained_bert/modeling.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 54291e0779..be0765f4bb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# PyTorch Pretrained Bert - PyTorch Pretrained OpenAI GPT +# PyTorch Pretrained Bert (also with PyTorch Pretrained OpenAI GPT) [![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT) @@ -125,18 +125,18 @@ from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Tokenized input -text = "Who was Jim Henson ? Jim Henson was a puppeteer" +text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" tokenized_text = tokenizer.tokenize(text) # Mask a token that we will try to predict back with `BertForMaskedLM` masked_index = 6 tokenized_text[masked_index] = '[MASK]' -assert tokenized_text == ['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer'] +assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]'] # Convert token to vocabulary indices indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) # Define sentence A and B indices associated to 1st and 2nd sentences (see paper) -segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] +segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] # Convert inputs to PyTorch tensors tokens_tensor = torch.tensor([indexed_tokens]) diff --git a/examples/extract_features.py b/examples/extract_features.py index 9d05d7905d..593576bdcb 100644 --- a/examples/extract_features.py +++ b/examples/extract_features.py @@ -80,10 +80,10 @@ def convert_examples_to_features(examples, seq_length, tokenizer): # The convention in BERT is: # (a) For sequence pairs: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # (b) For single sequences: # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 + # type_ids: 0 0 0 0 0 0 0 # # Where "type_ids" are used to indicate whether this is the first # sequence or the second sequence. The embedding vectors for `type=0` and diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 6a05873b20..591082f7ce 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -584,7 +584,7 @@ class BertModel(BertPreTrainedModel): to the last attention block of shape [batch_size, sequence_length, hidden_size], `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a classifier pretrained on top of the hidden state associated to the first character of the - input (`CLF`) to train on the Next-Sentence task (see BERT's paper). + input (`CLS`) to train on the Next-Sentence task (see BERT's paper). Example usage: ```python