Output hidden states (#4978)
* Configure all models to use output_hidden_states as argument passed to foward() * Pass all tests * Remove cast_bool_to_primitive in TF Flaubert model * correct tf xlnet * add pytorch test * add tf test * Fix broken tests * Configure all models to use output_hidden_states as argument passed to foward() * Pass all tests * Remove cast_bool_to_primitive in TF Flaubert model * correct tf xlnet * add pytorch test * add tf test * Fix broken tests * Refactor output_hidden_states for mobilebert * Reset and remerge to master Co-authored-by: Joseph Liu <joseph.liu@coinflex.com> Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
This commit is contained in:
@@ -296,7 +296,6 @@ CTRL_INPUTS_DOCSTRING = r"""
|
||||
class CTRLModel(CTRLPreTrainedModel):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.output_hidden_states = config.output_hidden_states
|
||||
|
||||
self.d_model_size = config.n_embd
|
||||
self.num_layers = config.n_layer
|
||||
@@ -338,6 +337,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
inputs_embeds=None,
|
||||
use_cache=True,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
):
|
||||
r"""
|
||||
Return:
|
||||
@@ -347,7 +347,8 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
|
||||
Contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
|
||||
when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
@@ -374,6 +375,9 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
|
||||
"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
|
||||
@@ -446,7 +450,7 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
all_hidden_states = ()
|
||||
all_attentions = []
|
||||
for i, (h, layer_past) in enumerate(zip(self.h, past)):
|
||||
if self.output_hidden_states:
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
|
||||
outputs = h(
|
||||
hidden_states,
|
||||
@@ -466,13 +470,13 @@ class CTRLModel(CTRLPreTrainedModel):
|
||||
|
||||
hidden_states = self.layernorm(hidden_states)
|
||||
hidden_states = hidden_states.view(*output_shape)
|
||||
if self.output_hidden_states:
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states,)
|
||||
|
||||
outputs = (hidden_states,)
|
||||
if use_cache is True:
|
||||
outputs = outputs + (presents,)
|
||||
if self.output_hidden_states:
|
||||
if output_hidden_states:
|
||||
outputs = outputs + (all_hidden_states,)
|
||||
if output_attentions:
|
||||
# let the number of heads free (-1) so we can extract attention even after head pruning
|
||||
@@ -518,6 +522,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
||||
labels=None,
|
||||
use_cache=True,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
@@ -536,7 +541,8 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
||||
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
|
||||
Contains pre-computed hidden-states (key and values in the attention blocks).
|
||||
Can be used (see `past` input) to speed up sequential decoding.
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
|
||||
when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
@@ -571,6 +577,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
)
|
||||
|
||||
hidden_states = transformer_outputs[0]
|
||||
|
||||
Reference in New Issue
Block a user