From 5288913bdd397bfe6e954ec7602d29f76dabc2a2 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 10 Jul 2019 15:16:40 -0400 Subject: [PATCH] All TODOs to be checked by Thom have been added. --- pytorch_transformers/modeling_gpt2.py | 27 ++++++++++--- pytorch_transformers/modeling_openai.py | 25 ++++++++---- pytorch_transformers/modeling_xlnet.py | 52 ++++++++++++++++--------- 3 files changed, 72 insertions(+), 32 deletions(-) diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py index ec2abf72b9..9ec5107b2a 100644 --- a/pytorch_transformers/modeling_gpt2.py +++ b/pytorch_transformers/modeling_gpt2.py @@ -483,7 +483,14 @@ class GPT2Model(GPT2PreTrainedModel): self.apply(self.init_weights) def set_num_special_tokens(self, num_special_tokens=None): - """Update input embeddings with new embedding matrix if needed.""" + """ + Update input embeddings with new embedding matrix if needed. + + Args: + num_special_tokens: Number of special tokens to add to the embedding matrix + + TODO Lysandre filled args + """ if num_special_tokens is None or self.config.n_special == num_special_tokens: return # Update config @@ -625,8 +632,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False - - Example:: config = modeling_gpt2.GPT2Config() @@ -642,7 +647,13 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings. - TODO Shouldn't we put args + returns ? + + Args: + num_special_tokens: Number of special tokens to add to the embedding matrix + predict_special_tokens: if set to True, the model will try to predict the special tokens. + Defaults to True. 
+ + TODO Lysandre filled args """ self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens self.transformer.set_num_special_tokens(num_special_tokens) @@ -737,7 +748,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): """ Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings - TODO Shouldn't we put args + returns ? + + Args: + num_special_tokens: Number of special tokens to add to the embedding matrix + predict_special_tokens: if set to True, the model will try to predict the special tokens. + Defaults to True. + + TODO Lysandre filled args """ self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens self.transformer.set_num_special_tokens(num_special_tokens) diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py index 37736efed7..d5e8185c12 100644 --- a/pytorch_transformers/modeling_openai.py +++ b/pytorch_transformers/modeling_openai.py @@ -496,12 +496,10 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): """ Update input embeddings with new embedding matrice if needed - TODO - Args: - num_special_tokens: + num_special_tokens: Number of special tokens to add to the embedding matrix - Returns: + TODO Lysandre filled Args """ if num_special_tokens is None or self.config.n_special == num_special_tokens: @@ -665,7 +663,13 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings - TODO + + Args: + num_special_tokens: Number of special tokens to add to the embedding matrix + predict_special_tokens: if set to True, the model will try to predict the special tokens. + Defaults to True. 
+ + TODO Lysandre filled Args """ self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens @@ -775,9 +779,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): self.apply(self.init_weights) def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True): - """ Update input and output embeddings with new embedding matrice - Make sure we are sharing the embeddings - TODO + """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings. + + Args: + num_special_tokens: Number of special tokens to add to the embedding matrix + predict_special_tokens: if set to True, the model will try to predict the special tokens. + Defaults to True. + + TODO Lysandre filled Args """ self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens self.transformer.set_num_special_tokens(num_special_tokens) diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index 5fee4e8524..7cef1b101b 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -623,7 +623,7 @@ class XLNetPreTrainedModel(PreTrainedModel): class XLNetModel(XLNetPreTrainedModel): """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding"). - TODO: this was copied from the XLNetLMHeadModel, check that it's ok. + TODO Lysandre filled: this was copied from the XLNetLMHeadModel, check that it's ok. Args: `config`: a XLNetConfig class instance with the configuration to build a new model @@ -631,7 +631,15 @@ class XLNetModel(XLNetPreTrainedModel): `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. 
Default: False - TODO: Add usage + + Example:: + + config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768, + n_layer=12, num_attention_heads=12, intermediate_size=3072) + + model = modeling.XLNetModel(config=config) + + TODO Lysandre filled: Added example usage """ def __init__(self, config): super(XLNetModel, self).__init__(config) @@ -663,8 +671,8 @@ class XLNetModel(XLNetPreTrainedModel): Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked. Args: - qlen: TODO - mlen: TODO + qlen: TODO Lysandre didn't fill + mlen: TODO Lysandre didn't fill :: @@ -783,19 +791,25 @@ class XLNetModel(XLNetPreTrainedModel): 1 for tokens with losses and 0 for tokens without losses. Only used during pretraining for two-stream attention. Set to None during finetuning. + head_mask: TODO Lysandre didn't fill - mem_len: int, the number of tokens to cache. - reuse_len: int, the number of tokens in the currect batch to be cached - and reused in the future. - bi_data: bool, whether to use bidirectional input pipeline. - Usually set to True during pretraining and False during finetuning. - clamp_len: int, clamp all relative distances larger than clamp_len. - -1 means no clamping. - same_length: bool, whether to use the same attention length for each token. - summary_type: str, "last", "first", "mean", or "attn". The method - to pool the input to get a vector representation. - TODO: Add usage + Returns: + TODO Lysandre didn't fill: Missing returns! 
+ + Example:: + + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + # or + all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask) + + TODO Lysandre filled: Filled with the LMHead example, is probably different since it has a different output + """ # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # but we want a unified interface in the library with the batch size on the first dimension @@ -951,14 +965,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient. This can be used to compute head importance metrics. Default: False - - Example:: config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768, n_layer=12, num_attention_heads=12, intermediate_size=3072) - model = modeling.XLNetModel(config=config) + model = modeling.XLNetLMHeadModel(config=config) + + TODO Lysandre modified: Changed XLNetModel to XLNetLMHeadModel in the example """ def __init__(self, config): super(XLNetLMHeadModel, self).__init__(config) @@ -1122,7 +1136,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): 1 for tokens with losses and 0 for tokens without losses. Only used during pre-training for two-stream attention. Set to None during fine-tuning. - labels: TODO + labels: TODO Lysandre didn't fill head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.