Update README and fix a few typos

This commit is contained in:
thomwolf
2019-01-10 01:40:00 +01:00
parent fa5222c296
commit e5c78c6684
3 changed files with 7 additions and 7 deletions

View File

@@ -1,4 +1,4 @@
# PyTorch Pretrained Bert - PyTorch Pretrained OpenAI GPT # PyTorch Pretrained Bert (also with PyTorch Pretrained OpenAI GPT)
[![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT) [![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT)
@@ -125,18 +125,18 @@ from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenized input # Tokenized input
text = "Who was Jim Henson ? Jim Henson was a puppeteer" text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text) tokenized_text = tokenizer.tokenize(text)
# Mask a token that we will try to predict back with `BertForMaskedLM` # Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 6 masked_index = 8
tokenized_text[masked_index] = '[MASK]' tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer'] assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
# Convert token to vocabulary indices # Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper) # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
# Convert inputs to PyTorch tensors # Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens]) tokens_tensor = torch.tensor([indexed_tokens])

View File

@@ -80,10 +80,10 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
# The convention in BERT is: # The convention in BERT is:
# (a) For sequence pairs: # (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences: # (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP] # tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0 # type_ids: 0 0 0 0 0 0 0
# #
# Where "type_ids" are used to indicate whether this is the first # Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and # sequence or the second sequence. The embedding vectors for `type=0` and

View File

@@ -584,7 +584,7 @@ class BertModel(BertPreTrainedModel):
to the last attention block of shape [batch_size, sequence_length, hidden_size], to the last attention block of shape [batch_size, sequence_length, hidden_size],
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
classifier pretrained on top of the hidden state associated with the first token of the classifier pretrained on top of the hidden state associated with the first token of the
input (`CLF`) to train on the Next-Sentence task (see BERT's paper). input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
Example usage: Example usage:
```python ```python