From 8722e9eb3bf68a03a3ea602812ec491b77646a1c Mon Sep 17 00:00:00 2001 From: Joel Grus Date: Sat, 23 Feb 2019 06:31:59 -0800 Subject: [PATCH] finish updating docstrings --- README.md | 12 ++++++------ pytorch_pretrained_bert/modeling_gpt2.py | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index bbe0f42aab..0e3ead57e9 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ The repository further comprises: - [`run_classifier.py`](./examples/run_classifier.py) - Show how to fine-tune an instance of `BertForSequenceClassification` on GLUE's MRPC task, - [`run_squad.py`](./examples/run_squad.py) - Show how to fine-tune an instance of `BertForQuestionAnswering` on SQuAD v1.0 and SQuAD v2.0 tasks. - [`run_swag.py`](./examples/run_swag.py) - Show how to fine-tune an instance of `BertForMultipleChoice` on Swag task. - - [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) - Show how to fine-tune an instance of `BertForPretraining` on a target text corpus. + - [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) - Show how to fine-tune an instance of `BertForPretraining` on a target text corpus. - One example on how to use **OpenAI GPT** (in the [`examples` folder](./examples)): - [`run_openai_gpt.py`](./examples/run_openai_gpt.py) - Show how to fine-tune an instance of `OpenGPTDoubleHeadsModel` on the RocStories task. @@ -569,7 +569,7 @@ An example on how to use this class is given in the [`extract_features.py`](./ex - the masked language modeling logits, and - the next sentence classification logits. - + An example on how to use this class is given in the [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) script which can be used to fine-tune the BERT language model on your specific different text corpus. This should improve model performance, if the language style is different from the original BERT training corpus (Wiki + BookCorpus). 
@@ -773,7 +773,7 @@ This model *outputs*: *Outputs*: - if `lm_labels` is not `None`: Outputs the language modeling loss. -- else: a tupple of +- else: a tuple of - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings] (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... d_n are the dimension of input_ids) - `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example). @@ -929,7 +929,7 @@ We showcase several fine-tuning examples based on (and extended from) [the origi - a *token-level classifier* on the question answering dataset SQuAD, and - a *sequence-level multiple-choice classifier* on the SWAG classification corpus. - a *BERT language model* on another target corpus - + #### MRPC This example code fine-tunes BERT on the Microsoft Research Paraphrase @@ -1045,7 +1045,7 @@ loss = 0.06423990014260186 #### LM Fine-tuning The data should be a text file in the same format as [sample_text.txt](./samples/sample_text.txt) (one sentence per line, docs separated by empty line). -You can download an [exemplary training corpus](https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt) generated from wikipedia articles and splitted into ~500k sentences with spaCy. +You can download an [exemplary training corpus](https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt) generated from wikipedia articles and splitted into ~500k sentences with spaCy. 
Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with `train_batch_size=200` and `max_seq_length=128`: @@ -1147,7 +1147,7 @@ python ./run_squad.py \ --doc_stride 128 \ --output_dir $OUTPUT_DIR \ --train_batch_size 24 \ - --gradient_accumulation_steps 2 + --gradient_accumulation_steps 2 ``` If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16). diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 2d21c54fa2..8795e9eeeb 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -492,12 +492,16 @@ class GPT2Model(GPT2PreTrainedModel): (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block. + `past`: an optional list of torch.FloatTensor that contains pre-computed hidden-states + (key and values in the attention blocks) to speed up sequential decoding + (this is the presents output of the model, cf. below). Outputs a tuple consisting of: `hidden_states`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids) - `presents`: ? + `presents`: a list of pre-computed hidden-states (key and values in each attention block) as + torch.FloatTensors. They can be reused to speed up sequential decoding. Example usage: ```python @@ -571,6 +575,9 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. 
All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size] + `past`: an optional list of torch.FloatTensor that contains pre-computed hidden-states + (key and values in the attention blocks) to speed up sequential decoding + (this is the presents output of the model, cf. below). Outputs: if `lm_labels` is not `None`: @@ -578,7 +585,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): else a tuple: `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size] (or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ... d_n are the dimension of input_ids) - `presents`: ... + `presents`: a list of pre-computed hidden-states (key and values in each attention block) as + torch.FloatTensors. They can be reused to speed up sequential decoding. Example usage: ```python @@ -636,6 +644,9 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): is only computed for the labels set in [0, ..., config.vocab_size] `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices]. + `past`: an optional list of torch.FloatTensor that contains pre-computed hidden-states + (key and values in the attention blocks) to speed up sequential decoding + (this is the presents output of the model, cf. below). Outputs: if `lm_labels` and `multiple_choice_labels` are not `None`: @@ -643,7 +654,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): else: a tuple with `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size] `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices] - `presents`: ... + `presents`: a list of pre-computed hidden-states (key and values in each attention block) as + torch.FloatTensors. They can be reused to speed up sequential decoding. 
Example usage: ```python