XGLM: Fix left-padding (PT and TF) (#22828)
This commit is contained in:
@@ -124,18 +124,6 @@ def create_sinusoidal_positions(n_pos, dim, padding_idx=1):
|
|||||||
return jnp.array(emb)
|
return jnp.array(emb)
|
||||||
|
|
||||||
|
|
||||||
def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
|
|
||||||
"""
|
|
||||||
Shift input ids one token to the right.
|
|
||||||
"""
|
|
||||||
shifted_input_ids = jnp.roll(input_ids, 1, axis=-1)
|
|
||||||
shifted_input_ids = shifted_input_ids.at[(..., 0)].set(decoder_start_token_id)
|
|
||||||
# replace possible -100 values in labels by `pad_token_id`
|
|
||||||
shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
|
|
||||||
|
|
||||||
return shifted_input_ids
|
|
||||||
|
|
||||||
|
|
||||||
class FlaxXGLMAttention(nn.Module):
|
class FlaxXGLMAttention(nn.Module):
|
||||||
config: XGLMConfig
|
config: XGLMConfig
|
||||||
embed_dim: int
|
embed_dim: int
|
||||||
|
|||||||
@@ -476,19 +476,8 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
|
|||||||
|
|
||||||
return combined_attention_mask
|
return combined_attention_mask
|
||||||
|
|
||||||
def embed_positions(
|
def embed_positions(self, position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None) -> tf.Tensor:
|
||||||
self,
|
position_ids += self.offset
|
||||||
input_ids: Optional[TFModelInputType] = None,
|
|
||||||
inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
|
||||||
past_key_values_length: Optional[int] = None,
|
|
||||||
) -> tf.Tensor:
|
|
||||||
if input_ids is not None:
|
|
||||||
position_ids = _create_position_ids_from_input_ids(input_ids, past_key_values_length, self.padding_idx)
|
|
||||||
else:
|
|
||||||
position_ids = _create_position_ids_from_inputs_embeds(
|
|
||||||
inputs_embeds, past_key_values_length, self.padding_idx
|
|
||||||
)
|
|
||||||
|
|
||||||
positions = tf.gather(self._embed_positions_weights, position_ids, axis=0)
|
positions = tf.gather(self._embed_positions_weights, position_ids, axis=0)
|
||||||
return positions
|
return positions
|
||||||
|
|
||||||
@@ -497,6 +486,7 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
|
|||||||
self,
|
self,
|
||||||
input_ids: Optional[TFModelInputType] = None,
|
input_ids: Optional[TFModelInputType] = None,
|
||||||
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
|
position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
@@ -528,9 +518,14 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
|
|||||||
else:
|
else:
|
||||||
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
# past_key_values_length
|
|
||||||
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
||||||
|
|
||||||
|
if position_ids is None:
|
||||||
|
position_ids = tf.expand_dims(
|
||||||
|
tf.range(past_key_values_length, input_shape[-1] + past_key_values_length), axis=0
|
||||||
|
)
|
||||||
|
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
|
||||||
|
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
# Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
|
# Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
|
||||||
# indices on GPU, returning zeros instead. This is a dangerous silent behavior.
|
# indices on GPU, returning zeros instead. This is a dangerous silent behavior.
|
||||||
@@ -552,7 +547,7 @@ class TFXGLMMainLayer(tf.keras.layers.Layer):
|
|||||||
encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1])
|
encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1])
|
||||||
|
|
||||||
# embed positions
|
# embed positions
|
||||||
positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length)
|
positions = self.embed_positions(position_ids)
|
||||||
|
|
||||||
hidden_states = tf.cast(inputs_embeds, dtype=tf.float32) + positions
|
hidden_states = tf.cast(inputs_embeds, dtype=tf.float32) + positions
|
||||||
|
|
||||||
@@ -713,6 +708,11 @@ XGLM_INPUTS_DOCSTRING = r"""
|
|||||||
- 0 for tokens that are **masked**.
|
- 0 for tokens that are **masked**.
|
||||||
|
|
||||||
[What are attention masks?](../glossary#attention-mask)
|
[What are attention masks?](../glossary#attention-mask)
|
||||||
|
position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||||
|
config.max_position_embeddings - 1]`.
|
||||||
|
|
||||||
|
[What are position IDs?](../glossary#position-ids)
|
||||||
encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
||||||
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
|
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
|
||||||
the decoder.
|
the decoder.
|
||||||
@@ -796,6 +796,7 @@ class TFXGLMModel(TFXGLMPreTrainedModel):
|
|||||||
self,
|
self,
|
||||||
input_ids: Optional[TFModelInputType] = None,
|
input_ids: Optional[TFModelInputType] = None,
|
||||||
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
|
position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
@@ -876,9 +877,6 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
|
|||||||
name="lm_head",
|
name="lm_head",
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO (Joao): investigate why XGLM has numerical issues in XLA generate
|
|
||||||
self.supports_xla_generation = False
|
|
||||||
|
|
||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.lm_head
|
return self.lm_head
|
||||||
|
|
||||||
@@ -890,11 +888,18 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
|
|||||||
if past_key_values:
|
if past_key_values:
|
||||||
inputs = tf.expand_dims(inputs[:, -1], -1)
|
inputs = tf.expand_dims(inputs[:, -1], -1)
|
||||||
|
|
||||||
|
position_ids = kwargs.get("position_ids", None)
|
||||||
attention_mask = kwargs.get("attention_mask", None)
|
attention_mask = kwargs.get("attention_mask", None)
|
||||||
|
|
||||||
|
if attention_mask is not None and position_ids is None:
|
||||||
|
position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
|
||||||
|
if past_key_values:
|
||||||
|
position_ids = tf.expand_dims(position_ids[:, -1], -1)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"input_ids": inputs,
|
"input_ids": inputs,
|
||||||
"attention_mask": attention_mask,
|
"attention_mask": attention_mask,
|
||||||
|
"position_ids": position_ids,
|
||||||
"past_key_values": past_key_values,
|
"past_key_values": past_key_values,
|
||||||
"use_cache": use_cache,
|
"use_cache": use_cache,
|
||||||
}
|
}
|
||||||
@@ -911,6 +916,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
|
|||||||
self,
|
self,
|
||||||
input_ids: Optional[TFModelInputType] = None,
|
input_ids: Optional[TFModelInputType] = None,
|
||||||
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
|
position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
|
||||||
@@ -935,6 +941,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss):
|
|||||||
outputs = self.model(
|
outputs = self.model(
|
||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
|
position_ids=position_ids,
|
||||||
encoder_hidden_states=encoder_hidden_states,
|
encoder_hidden_states=encoder_hidden_states,
|
||||||
encoder_attention_mask=encoder_attention_mask,
|
encoder_attention_mask=encoder_attention_mask,
|
||||||
head_mask=head_mask,
|
head_mask=head_mask,
|
||||||
|
|||||||
@@ -75,11 +75,34 @@ XGLM_INPUTS_DOCSTRING = r"""
|
|||||||
- 0 for tokens that are **masked**.
|
- 0 for tokens that are **masked**.
|
||||||
|
|
||||||
[What are attention masks?](../glossary#attention-mask)
|
[What are attention masks?](../glossary#attention-mask)
|
||||||
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
|
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||||
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
|
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||||
|
config.max_position_embeddings - 1]`.
|
||||||
|
|
||||||
|
[What are position IDs?](../glossary#position-ids)
|
||||||
|
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
||||||
|
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
|
||||||
|
the decoder.
|
||||||
|
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
|
||||||
|
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
|
||||||
|
selected in `[0, 1]`:
|
||||||
|
|
||||||
|
- 1 for tokens that are **not masked**,
|
||||||
|
- 0 for tokens that are **masked**.
|
||||||
|
|
||||||
|
[What are attention masks?](../glossary#attention-mask)
|
||||||
|
head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
|
||||||
|
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
|
||||||
|
|
||||||
- 1 indicates the head is **not masked**,
|
- 1 indicates the head is **not masked**,
|
||||||
- 0 indicates the head is **masked**.
|
- 0 indicates the head is **masked**.
|
||||||
|
|
||||||
|
cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
|
||||||
|
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
|
||||||
|
|
||||||
|
- 1 indicates the head is **not masked**,
|
||||||
|
- 0 indicates the head is **masked**.
|
||||||
|
|
||||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
||||||
@@ -88,20 +111,12 @@ XGLM_INPUTS_DOCSTRING = r"""
|
|||||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
||||||
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
||||||
|
|
||||||
If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
|
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
|
||||||
have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
|
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
||||||
of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
|
`decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape
|
||||||
sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to
|
`(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you
|
||||||
directly pass an embedded representation. This is useful if you want more control over how to convert
|
can choose to directly pass an embedded representation. This is useful if you want more control over how to
|
||||||
`input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
|
convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
|
||||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
|
|
||||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If
|
|
||||||
`past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
|
|
||||||
`past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into
|
|
||||||
associated vectors than the model's internal embedding lookup matrix.
|
|
||||||
use_cache (`bool`, *optional*):
|
|
||||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
|
||||||
`past_key_values`).
|
|
||||||
output_attentions (`bool`, *optional*):
|
output_attentions (`bool`, *optional*):
|
||||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||||
tensors for more detail.
|
tensors for more detail.
|
||||||
@@ -146,18 +161,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
|
|||||||
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
|
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
|
||||||
|
|
||||||
|
|
||||||
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
|
|
||||||
"""
|
|
||||||
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
|
|
||||||
are ignored. This is modified from fairseq's `utils.make_positions`.
|
|
||||||
"""
|
|
||||||
# The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
|
|
||||||
mask = input_ids.ne(padding_idx).int()
|
|
||||||
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
|
|
||||||
return incremental_indices.long() + padding_idx
|
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding with M2M100->XGLM
|
|
||||||
class XGLMSinusoidalPositionalEmbedding(nn.Module):
|
class XGLMSinusoidalPositionalEmbedding(nn.Module):
|
||||||
"""This module produces sinusoidal positional embeddings of any length."""
|
"""This module produces sinusoidal positional embeddings of any length."""
|
||||||
|
|
||||||
@@ -198,43 +201,17 @@ class XGLMSinusoidalPositionalEmbedding(nn.Module):
|
|||||||
return emb.to(torch.get_default_dtype())
|
return emb.to(torch.get_default_dtype())
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
def forward(
|
def forward(self, position_ids: torch.Tensor = None, past_key_values_length: int = 0):
|
||||||
self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
|
bsz, seq_len = position_ids.size()
|
||||||
):
|
position_ids += self.offset
|
||||||
if input_ids is not None:
|
|
||||||
bsz, seq_len = input_ids.size()
|
|
||||||
# Create the position ids from the input token ids. Any padded tokens remain padded.
|
|
||||||
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
|
|
||||||
input_ids.device
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
bsz, seq_len = inputs_embeds.size()[:-1]
|
|
||||||
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
|
|
||||||
|
|
||||||
# expand embeddings if needed
|
# Expand embeddings if needed. `position_ids.max()` is NOT used to keep torch.fx compatibility.
|
||||||
max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
|
max_pos = 2 + seq_len + past_key_values_length
|
||||||
if max_pos > self.weights.size(0):
|
if max_pos > self.weights.size(0):
|
||||||
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
|
self.make_weights(max_pos, self.embedding_dim, self.padding_idx)
|
||||||
|
|
||||||
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
|
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
|
||||||
|
|
||||||
def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
|
|
||||||
"""
|
|
||||||
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
inputs_embeds: torch.Tensor
|
|
||||||
|
|
||||||
Returns: torch.Tensor
|
|
||||||
"""
|
|
||||||
input_shape = inputs_embeds.size()[:-1]
|
|
||||||
sequence_length = input_shape[1]
|
|
||||||
|
|
||||||
position_ids = torch.arange(
|
|
||||||
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
|
|
||||||
)
|
|
||||||
return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
|
|
||||||
|
|
||||||
|
|
||||||
class XGLMAttention(nn.Module):
|
class XGLMAttention(nn.Module):
|
||||||
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
||||||
@@ -605,6 +582,7 @@ class XGLMModel(XGLMPreTrainedModel):
|
|||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.Tensor] = None,
|
input_ids: Optional[torch.Tensor] = None,
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
|
position_ids: Optional[torch.Tensor] = None,
|
||||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||||
encoder_attention_mask: Optional[torch.Tensor] = None,
|
encoder_attention_mask: Optional[torch.Tensor] = None,
|
||||||
head_mask: Optional[torch.Tensor] = None,
|
head_mask: Optional[torch.Tensor] = None,
|
||||||
@@ -616,70 +594,6 @@ class XGLMModel(XGLMPreTrainedModel):
|
|||||||
output_hidden_states: Optional[bool] = None,
|
output_hidden_states: Optional[bool] = None,
|
||||||
return_dict: Optional[bool] = None,
|
return_dict: Optional[bool] = None,
|
||||||
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
|
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
|
||||||
r"""
|
|
||||||
Args:
|
|
||||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
|
||||||
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
|
|
||||||
provide it.
|
|
||||||
|
|
||||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
|
||||||
[`PreTrainedTokenizer.__call__`] for details.
|
|
||||||
|
|
||||||
[What are input IDs?](../glossary#input-ids)
|
|
||||||
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
|
||||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
|
||||||
|
|
||||||
- 1 for tokens that are **not masked**,
|
|
||||||
- 0 for tokens that are **masked**.
|
|
||||||
|
|
||||||
[What are attention masks?](../glossary#attention-mask)
|
|
||||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
|
||||||
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
|
|
||||||
of the decoder.
|
|
||||||
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
|
|
||||||
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
|
|
||||||
selected in `[0, 1]`:
|
|
||||||
|
|
||||||
- 1 for tokens that are **not masked**,
|
|
||||||
- 0 for tokens that are **masked**.
|
|
||||||
|
|
||||||
[What are attention masks?](../glossary#attention-mask)
|
|
||||||
head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
|
|
||||||
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
|
|
||||||
|
|
||||||
- 1 indicates the head is **not masked**,
|
|
||||||
- 0 indicates the head is **masked**.
|
|
||||||
|
|
||||||
cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
|
|
||||||
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
|
|
||||||
|
|
||||||
- 1 indicates the head is **not masked**,
|
|
||||||
- 0 indicates the head is **masked**.
|
|
||||||
|
|
||||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
|
||||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
|
|
||||||
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
|
|
||||||
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
|
||||||
|
|
||||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
|
|
||||||
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
|
||||||
|
|
||||||
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
|
|
||||||
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
|
|
||||||
all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of
|
|
||||||
shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
|
|
||||||
`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more
|
|
||||||
control over how to convert `input_ids` indices into associated vectors than the model's internal
|
|
||||||
embedding lookup matrix.
|
|
||||||
output_attentions (`bool`, *optional*):
|
|
||||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
|
||||||
returned tensors for more detail.
|
|
||||||
output_hidden_states (`bool`, *optional*):
|
|
||||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
|
||||||
for more detail.
|
|
||||||
return_dict (`bool`, *optional*):
|
|
||||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
|
||||||
"""
|
|
||||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||||
output_hidden_states = (
|
output_hidden_states = (
|
||||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||||
@@ -698,9 +612,19 @@ class XGLMModel(XGLMPreTrainedModel):
|
|||||||
else:
|
else:
|
||||||
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
||||||
|
|
||||||
# past_key_values_length
|
|
||||||
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
||||||
|
|
||||||
|
if position_ids is None:
|
||||||
|
position_ids = torch.arange(
|
||||||
|
past_key_values_length,
|
||||||
|
input_shape[-1] + past_key_values_length,
|
||||||
|
dtype=torch.long,
|
||||||
|
device=input_ids.device if input_ids is not None else inputs_embeds.device,
|
||||||
|
)
|
||||||
|
position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
|
||||||
|
else:
|
||||||
|
position_ids = position_ids.view(-1, input_shape[-1])
|
||||||
|
|
||||||
if inputs_embeds is None:
|
if inputs_embeds is None:
|
||||||
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
|
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
|
||||||
|
|
||||||
@@ -713,11 +637,7 @@ class XGLMModel(XGLMPreTrainedModel):
|
|||||||
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
||||||
encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
|
encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
|
||||||
|
|
||||||
# embed positions
|
hidden_states = inputs_embeds + self.embed_positions(position_ids, past_key_values_length)
|
||||||
positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length)
|
|
||||||
|
|
||||||
hidden_states = inputs_embeds + positions
|
|
||||||
|
|
||||||
hidden_states = nn.functional.dropout(hidden_states, p=float(self.dropout), training=self.training)
|
hidden_states = nn.functional.dropout(hidden_states, p=float(self.dropout), training=self.training)
|
||||||
|
|
||||||
if self.gradient_checkpointing and self.training:
|
if self.gradient_checkpointing and self.training:
|
||||||
@@ -866,6 +786,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
|
|||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.Tensor] = None,
|
input_ids: Optional[torch.Tensor] = None,
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
|
position_ids: Optional[torch.Tensor] = None,
|
||||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||||
encoder_attention_mask: Optional[torch.Tensor] = None,
|
encoder_attention_mask: Optional[torch.Tensor] = None,
|
||||||
head_mask: Optional[torch.Tensor] = None,
|
head_mask: Optional[torch.Tensor] = None,
|
||||||
@@ -895,6 +816,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
|
|||||||
outputs = self.model(
|
outputs = self.model(
|
||||||
input_ids=input_ids,
|
input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
|
position_ids=position_ids,
|
||||||
encoder_hidden_states=encoder_hidden_states,
|
encoder_hidden_states=encoder_hidden_states,
|
||||||
encoder_attention_mask=encoder_attention_mask,
|
encoder_attention_mask=encoder_attention_mask,
|
||||||
head_mask=head_mask,
|
head_mask=head_mask,
|
||||||
@@ -935,9 +857,18 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
|
|||||||
def prepare_inputs_for_generation(
|
def prepare_inputs_for_generation(
|
||||||
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
|
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
|
||||||
):
|
):
|
||||||
# if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
|
position_ids = kwargs.get("position_ids", None)
|
||||||
if attention_mask is None:
|
if attention_mask is not None and position_ids is None:
|
||||||
attention_mask = input_ids.new_ones(input_ids.shape)
|
# create position_ids on the fly for batch generation
|
||||||
|
position_ids = attention_mask.long().cumsum(-1) - 1
|
||||||
|
position_ids.masked_fill_(attention_mask == 0, 1)
|
||||||
|
if past_key_values:
|
||||||
|
position_ids = position_ids[:, -1].unsqueeze(-1)
|
||||||
|
else:
|
||||||
|
position_ids = None
|
||||||
|
# if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
|
||||||
|
if attention_mask is None:
|
||||||
|
attention_mask = input_ids.new_ones(input_ids.shape)
|
||||||
|
|
||||||
if past_key_values:
|
if past_key_values:
|
||||||
input_ids = input_ids[:, -1:]
|
input_ids = input_ids[:, -1:]
|
||||||
@@ -945,6 +876,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel):
|
|||||||
return {
|
return {
|
||||||
"input_ids": input_ids, # encoder_outputs is defined. input_ids not needed
|
"input_ids": input_ids, # encoder_outputs is defined. input_ids not needed
|
||||||
"attention_mask": attention_mask,
|
"attention_mask": attention_mask,
|
||||||
|
"position_ids": position_ids,
|
||||||
"past_key_values": past_key_values,
|
"past_key_values": past_key_values,
|
||||||
"use_cache": use_cache,
|
"use_cache": use_cache,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -175,44 +175,6 @@ class TFXGLMModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
|||||||
name = model.get_bias()
|
name = model.get_bias()
|
||||||
assert name is None
|
assert name is None
|
||||||
|
|
||||||
@slow
|
|
||||||
def test_batch_generation(self):
|
|
||||||
model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
|
|
||||||
tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
|
|
||||||
|
|
||||||
tokenizer.padding_side = "left"
|
|
||||||
|
|
||||||
# use different length sentences to test batching
|
|
||||||
sentences = [
|
|
||||||
"Hello, my dog is a little",
|
|
||||||
"Today, I",
|
|
||||||
]
|
|
||||||
|
|
||||||
inputs = tokenizer(sentences, return_tensors="tf", padding=True)
|
|
||||||
|
|
||||||
outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
|
|
||||||
|
|
||||||
inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids
|
|
||||||
output_non_padded = model.generate(input_ids=inputs_non_padded)
|
|
||||||
|
|
||||||
num_paddings = (
|
|
||||||
inputs_non_padded.shape[-1]
|
|
||||||
- tf.math.reduce_sum(tf.cast(inputs["attention_mask"][-1], dtype=tf.int64)).numpy()
|
|
||||||
)
|
|
||||||
inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids
|
|
||||||
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
|
|
||||||
|
|
||||||
batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
|
||||||
non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
|
|
||||||
padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
|
|
||||||
|
|
||||||
expected_output_sentence = [
|
|
||||||
"Hello, my dog is a little bit of a shy one, but he is very friendly",
|
|
||||||
"Today, I am going to share with you a few of my favorite things",
|
|
||||||
]
|
|
||||||
self.assertListEqual(expected_output_sentence, batch_out_sentence)
|
|
||||||
self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
|
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
for model_name in TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
for model_name in TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
||||||
@@ -246,7 +208,9 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase):
|
|||||||
tf.random.set_seed(0)
|
tf.random.set_seed(0)
|
||||||
tokenized = tokenizer("Today is a nice day and", return_tensors="tf")
|
tokenized = tokenizer("Today is a nice day and", return_tensors="tf")
|
||||||
input_ids = tokenized.input_ids
|
input_ids = tokenized.input_ids
|
||||||
output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0])
|
# forces the generation to happen on CPU, to avoid GPU-related quirks (and ensure the same output regardless of the available devices)
|
||||||
|
with tf.device(":/CPU:0"):
|
||||||
|
output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0])
|
||||||
output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
||||||
|
|
||||||
EXPECTED_OUTPUT_STR = (
|
EXPECTED_OUTPUT_STR = (
|
||||||
@@ -255,33 +219,41 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase):
|
|||||||
self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
|
self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_lm_generate_xglm_left_padding(self):
|
def test_batch_generation(self):
|
||||||
"""Tests that the generated text is the same, regardless of left padding"""
|
|
||||||
tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
|
|
||||||
model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
|
model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M")
|
||||||
|
tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
|
||||||
|
|
||||||
tokenizer.padding_side = "left"
|
tokenizer.padding_side = "left"
|
||||||
|
|
||||||
generation_kwargs = {
|
# use different length sentences to test batching
|
||||||
"bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids],
|
sentences = [
|
||||||
"no_repeat_ngram_size": 2,
|
"This is an extremelly long sentence that only exists to test the ability of the model to cope with "
|
||||||
"do_sample": False,
|
"left-padding, such as in batched generation. The output for the sequence below should be the same "
|
||||||
"repetition_penalty": 1.3,
|
"regardless of whether left padding is applied or not. When",
|
||||||
}
|
"Hello, my dog is a little",
|
||||||
expected_output_string = (
|
]
|
||||||
"Today is a beautiful day and I am so glad that we have the opportunity to spend time with"
|
|
||||||
)
|
|
||||||
|
|
||||||
sentences = ["Today is a beautiful day and"]
|
inputs = tokenizer(sentences, return_tensors="tf", padding=True)
|
||||||
input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
|
input_ids = inputs["input_ids"]
|
||||||
# using default length
|
|
||||||
output_ids = model.generate(**input_ids, **generation_kwargs)
|
|
||||||
output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
|
|
||||||
self.assertEqual(output_strings[0], expected_output_string)
|
|
||||||
|
|
||||||
sentences = ["Today is a beautiful day and", "This is a very long input that we absolutely don't care about"]
|
outputs = model.generate(input_ids=input_ids, attention_mask=inputs["attention_mask"], max_new_tokens=12)
|
||||||
input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
|
|
||||||
# longer max length to capture the full length (remember: it is left padded)
|
inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids
|
||||||
output_ids = model.generate(**input_ids, **generation_kwargs, max_length=28)
|
output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=12)
|
||||||
output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
|
|
||||||
self.assertEqual(output_strings[0], expected_output_string)
|
inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids
|
||||||
|
output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=12)
|
||||||
|
|
||||||
|
batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||||
|
non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
|
||||||
|
padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
|
||||||
|
|
||||||
|
expected_output_sentence = [
|
||||||
|
"This is an extremelly long sentence that only exists to test the ability of the model to cope with "
|
||||||
|
"left-padding, such as in batched generation. The output for the sequence below should be the same "
|
||||||
|
"regardless of whether left padding is applied or not. When left padding is applied, the sequence will be "
|
||||||
|
"a single",
|
||||||
|
"Hello, my dog is a little bit of a shy one, but he is very friendly",
|
||||||
|
]
|
||||||
|
self.assertListEqual(expected_output_sentence, batch_out_sentence)
|
||||||
|
self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
|
||||||
|
|||||||
@@ -340,46 +340,6 @@ class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
|||||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||||
self.model_tester.create_and_check_xglm_weight_initialization(*config_and_inputs)
|
self.model_tester.create_and_check_xglm_weight_initialization(*config_and_inputs)
|
||||||
|
|
||||||
@slow
|
|
||||||
def test_batch_generation(self):
|
|
||||||
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
|
|
||||||
model.to(torch_device)
|
|
||||||
tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
|
|
||||||
|
|
||||||
tokenizer.padding_side = "left"
|
|
||||||
|
|
||||||
# use different length sentences to test batching
|
|
||||||
sentences = [
|
|
||||||
"Hello, my dog is a little",
|
|
||||||
"Today, I",
|
|
||||||
]
|
|
||||||
|
|
||||||
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
|
|
||||||
input_ids = inputs["input_ids"].to(torch_device)
|
|
||||||
|
|
||||||
outputs = model.generate(
|
|
||||||
input_ids=input_ids,
|
|
||||||
attention_mask=inputs["attention_mask"].to(torch_device),
|
|
||||||
)
|
|
||||||
|
|
||||||
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
|
|
||||||
output_non_padded = model.generate(input_ids=inputs_non_padded)
|
|
||||||
|
|
||||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
|
|
||||||
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
|
|
||||||
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
|
|
||||||
|
|
||||||
batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
|
||||||
non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
|
|
||||||
padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
|
|
||||||
|
|
||||||
expected_output_sentence = [
|
|
||||||
"Hello, my dog is a little bit of a shy one, but he is very friendly",
|
|
||||||
"Today, I am going to share with you a few of my favorite things",
|
|
||||||
]
|
|
||||||
self.assertListEqual(expected_output_sentence, batch_out_sentence)
|
|
||||||
self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
|
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_model_from_pretrained(self):
|
def test_model_from_pretrained(self):
|
||||||
for model_name in XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
for model_name in XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
|
||||||
@@ -409,6 +369,49 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase):
|
|||||||
if verify_outputs:
|
if verify_outputs:
|
||||||
self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
|
self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_batch_generation(self):
|
||||||
|
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")
|
||||||
|
model.to(torch_device)
|
||||||
|
tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
|
||||||
|
|
||||||
|
tokenizer.padding_side = "left"
|
||||||
|
|
||||||
|
# use different length sentences to test batching
|
||||||
|
sentences = [
|
||||||
|
"This is an extremelly long sentence that only exists to test the ability of the model to cope with "
|
||||||
|
"left-padding, such as in batched generation. The output for the sequence below should be the same "
|
||||||
|
"regardless of whether left padding is applied or not. When",
|
||||||
|
"Hello, my dog is a little",
|
||||||
|
]
|
||||||
|
|
||||||
|
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
|
||||||
|
input_ids = inputs["input_ids"].to(torch_device)
|
||||||
|
|
||||||
|
outputs = model.generate(
|
||||||
|
input_ids=input_ids, attention_mask=inputs["attention_mask"].to(torch_device), max_new_tokens=12
|
||||||
|
)
|
||||||
|
|
||||||
|
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
|
||||||
|
output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=12)
|
||||||
|
|
||||||
|
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
|
||||||
|
output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=12)
|
||||||
|
|
||||||
|
batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||||
|
non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
|
||||||
|
padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
|
||||||
|
|
||||||
|
expected_output_sentence = [
|
||||||
|
"This is an extremelly long sentence that only exists to test the ability of the model to cope with "
|
||||||
|
"left-padding, such as in batched generation. The output for the sequence below should be the same "
|
||||||
|
"regardless of whether left padding is applied or not. When left padding is applied, the sequence will be "
|
||||||
|
"a single",
|
||||||
|
"Hello, my dog is a little bit of a shy one, but he is very friendly",
|
||||||
|
]
|
||||||
|
self.assertListEqual(expected_output_sentence, batch_out_sentence)
|
||||||
|
self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_lm_generate_xglm(self):
|
def test_lm_generate_xglm(self):
|
||||||
self._test_lm_generate_xglm_helper()
|
self._test_lm_generate_xglm_helper()
|
||||||
|
|||||||
Reference in New Issue
Block a user