Refactored Docstrings of BERT, GPT2, GPT, TransfoXL, XLM and XLNet.

2019-07-09 15:55:31 -04:00
parent ed6c8d37f4
commit 8fe2c9d98e
13 changed files with 924 additions and 763 deletions
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -127,7 +127,29 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}


 class OpenAIGPTConfig(PretrainedConfig):
-    """Configuration class to store the configuration of a `OpenAIGPTModel`.
+    """
+    Configuration class to store the configuration of a `OpenAIGPTModel`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `OpenAIGPTModel` or a configuration json file.
+        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        n_embd: Dimensionality of the embeddings and hidden states.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        afn: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        resid_pdrop: The dropout probability for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attn_pdrop: The dropout ratio for the attention
+            probabilities.
+        embd_pdrop: The dropout ratio for the embeddings.
+        layer_norm_epsilon: epsilon to use in the layer norm layers
+        initializer_range: The stddev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        predict_special_tokens: should we predict special tokens (when the model has a LM head)
    """
    pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP

@@ -157,27 +179,6 @@ class OpenAIGPTConfig(PretrainedConfig):
        **kwargs
    ):
        """Constructs OpenAIGPTConfig.
-
-        Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
-            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
-            n_positions: Number of positional embeddings.
-            n_ctx: Size of the causal mask (usually same as n_positions).
-            n_embd: Dimensionality of the embeddings and hidden states.
-            n_layer: Number of hidden layers in the Transformer encoder.
-            n_head: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            afn: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-            resid_pdrop: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attn_pdrop: The dropout ratio for the attention
-                probabilities.
-            embd_pdrop: The dropout ratio for the embeddings.
-            layer_norm_epsilon: epsilon to use in the layer norm layers
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-            predict_special_tokens: should we predict special tokens (when the model has a LM head)
        """
        super(OpenAIGPTConfig, self).__init__(**kwargs)

@@ -441,12 +442,16 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
    """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").

-    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
+    Special tokens embeddings are additional tokens that are not pre-trained, such as: [SEP], [CLS]...
+
+    Special tokens need to be trained during the fine-tuning if you use them.
+    The number of special embeddings can be controlled using the ``set_num_special_tokens(num_special_tokens)`` function.
+
+    The embeddings are ordered as follows in the token embeddings matrix:
+
+    ::

-    The embeddings are ordered as follow in the token embeddings matrice:
        [0,                                                         ----------------------
         ...                                                        -> word embeddings
         config.vocab_size - 1,                                     ______________________
@@ -454,44 +459,25 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         ...                                                        -> special embeddings
         config.vocab_size + config.n_special - 1]                  ______________________

-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
+    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:

-    Params:
+    ::
+
+        total_tokens_embeddings = config.vocab_size + config.n_special
+
+    You should use the associated indices to index the embeddings.
+
+    Args:
        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
            This can be used to compute head importance metrics. Default: False

-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

-    Outputs:
-        `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
-            as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
-            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
+    Example::

-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-    config = modeling_openai.OpenAIGPTConfig()
-
-    model = modeling_openai.OpenAIGPTModel(config)
-    hidden_states = model(input_ids)
-    ```
+        config = modeling_openai.OpenAIGPTConfig()
+        model = modeling_openai.OpenAIGPTModel(config)
    """

    def __init__(self, config):
@@ -507,7 +493,17 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
        self.apply(self.init_weights)

    def set_num_special_tokens(self, num_special_tokens=None):
-        " Update input embeddings with new embedding matrice if needed "
+        """
+        Update input embeddings with new embedding matrix if needed
+
+        TODO
+
+        Args:
+            num_special_tokens:
+
+        Returns:
+
+        """
        if num_special_tokens is None or self.config.n_special == num_special_tokens:
            return
        # Update config
@@ -528,6 +524,37 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
            self.h[layer].attn.prune_heads(heads)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+                where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1]).
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            ``hidden_states``, a list of all the encoded-hidden-states in the model (length of the list is number
+            of layers + 1 for the output of the embeddings)
+            as ``torch.FloatTensor`` of size [batch_size, sequence_length, hidden_size]
+            (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+            hidden_states = model(input_ids)
+            # or
+            hidden_states = model.forward(input_ids)
+        """
        if position_ids is None:
            # This was used when we had a single embedding matrice from position and token embeddings
            # start = self.config.vocab_size + self.config.n_special
@@ -594,10 +621,13 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):

    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    Special tokens need to be trained during the fine-tuning if you use them. The number of special embeddings
+    can be controlled using the ``set_num_special_tokens(num_special_tokens)`` function.
+
+    The embeddings are ordered as follows in the token embeddings matrix:
+
+    ::

-    The embeddings are ordered as follow in the token embeddings matrice:
        [0,                                                         ----------------------
         ...                                                        -> word embeddings
         config.vocab_size - 1,                                     ______________________
@@ -605,49 +635,25 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         ...                                                        -> special embeddings
         config.vocab_size + config.n_special - 1]                  ______________________

-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
+    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:

-    Params:
+    ::
+
+        total_tokens_embeddings = config.vocab_size + config.n_special
+
+    You should use the associated indices to index the embeddings.
+
+    Args:
        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
            This can be used to compute head importance metrics. Default: False

-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
-            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., vocab_size]
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

-    Outputs:
-        if `lm_labels` is not `None`:
-            Outputs the language modeling loss.
-        else:
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings]
-                (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... d_n are the dimension of input_ids)
+    Example::

-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-    config = modeling_openai.OpenAIGPTConfig()
-
-    model = modeling_openai.OpenAIGPTLMHeadModel(config)
-    lm_logits = model(input_ids)
-    ```
+        config = modeling_openai.OpenAIGPTConfig()
+        model = modeling_openai.OpenAIGPTLMHeadModel(config)
    """

    def __init__(self, config):
@@ -657,14 +663,50 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
        self.apply(self.init_weights)

    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
+        """
+        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
+        TODO
+
        """
        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
        self.transformer.set_num_special_tokens(num_special_tokens)
        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+                where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1]).
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+                is only computed for the labels set in [0, ..., vocab_size]
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            if ``lm_labels`` is not ``None``, outputs the language modeling loss. Otherwise, outputs ``lm_logits``,
+            the language modeling logits as a ``torch.FloatTensor`` of size [batch_size, sequence_length,
+            total_tokens_embeddings] (or more generally [d_1, ..., d_n, total_tokens_embeddings] where d_1 ... d_n are
+            the dimensions of input_ids)
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+            lm_logits = model(input_ids)
+            # or
+            lm_logits = model.forward(input_ids)
+        """
        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
        hidden_states = transformer_outputs[0]
        lm_logits = self.lm_head(hidden_states)
@@ -689,9 +731,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    The number of special embeddings can be controlled using the ``set_num_special_tokens(num_special_tokens)``
+    function.
+
+    The embeddings are ordered as follows in the token embeddings matrix:
+
+    ::

-    The embeddings are ordered as follow in the token embeddings matrice:
        [0,                                                         ----------------------
         ...                                                        -> word embeddings
         config.vocab_size - 1,                                     ______________________
@@ -699,54 +745,24 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         ...                                                        -> special embeddings
         config.vocab_size + config.n_special - 1]                  ______________________

-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
+    where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+
+    ::
+
        total_tokens_embeddings = config.vocab_size + config.n_special
+
    You should use the associate indices to index the embeddings.

-    Params:
+    Args:
        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
            This can be used to compute head importance metrics. Default: False

-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
-            indices selected in the range [0, total_tokens_embeddings[
-        `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
-            which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., total_tokens_embeddings]
-        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, ..., num_choices].
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::

-    Outputs:
-        if `lm_labels` and `multiple_choice_labels` are not `None`:
-            Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
-        else: a tuple with
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
-            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
-    mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
-
-    config = modeling_openai.OpenAIGPTConfig()
-
-    model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
-    lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
-    ```
+        config = modeling_openai.OpenAIGPTConfig()
+        model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
    """

    def __init__(self, config):
@@ -761,6 +777,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
        """ Update input and output embeddings with new embedding matrice
            Make sure we are sharing the embeddings
+            TODO
        """
        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
        self.transformer.set_num_special_tokens(num_special_tokens)
@@ -768,6 +785,50 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):

    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                position_ids=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with the BPE token
+                indices selected in the range [0, total_tokens_embeddings[
+            `mc_token_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices] with the index of the token from
+                which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1]).
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
+                with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss
+                is only computed for the labels set in [0, ..., total_tokens_embeddings]
+            `mc_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
+                with indices selected in [0, ..., num_choices].
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            if ``lm_labels`` and ``mc_labels`` are not ``None``, outputs a tuple of losses with the
+            language modeling loss and the multiple choice loss. Otherwise, returns a
+            ``tuple(lm_logits, multiple_choice_logits)``.
+
+                ``lm_logits`` are the language modeling logits as a ``torch.FloatTensor`` of size
+                [batch_size, num_choices, sequence_length, total_tokens_embeddings]
+
+                ``multiple_choice_logits``: the multiple choice logits as a ``torch.FloatTensor`` of
+                size [batch_size, num_choices]
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
+            mc_token_ids = torch.LongTensor([[2, 1]]) # (bsz, number of choice)
+
+            lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
+            # or
+            lm_logits, multiple_choice_logits = model.forward(input_ids, mc_token_ids)
+        """
        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
        hidden_states = transformer_outputs[0]