From 4060d6857e12b8ad76e7562cf5146263a04f23ca Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 20 Apr 2023 10:01:56 +0100 Subject: [PATCH] XGLM: Fix left-padding (PT and TF) (#22828) --- .../models/xglm/modeling_flax_xglm.py | 12 -- .../models/xglm/modeling_tf_xglm.py | 43 ++-- src/transformers/models/xglm/modeling_xglm.py | 198 ++++++------------ tests/models/xglm/test_modeling_tf_xglm.py | 98 ++++----- tests/models/xglm/test_modeling_xglm.py | 83 ++++---- 5 files changed, 168 insertions(+), 266 deletions(-) diff --git a/src/transformers/models/xglm/modeling_flax_xglm.py b/src/transformers/models/xglm/modeling_flax_xglm.py index 5a2a8951f1..b2acd66f44 100644 --- a/src/transformers/models/xglm/modeling_flax_xglm.py +++ b/src/transformers/models/xglm/modeling_flax_xglm.py @@ -124,18 +124,6 @@ def create_sinusoidal_positions(n_pos, dim, padding_idx=1): return jnp.array(emb) -def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray: - """ - Shift input ids one token to the right. - """ - shifted_input_ids = jnp.roll(input_ids, 1, axis=-1) - shifted_input_ids = shifted_input_ids.at[(..., 0)].set(decoder_start_token_id) - # replace possible -100 values in labels by `pad_token_id` - shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids) - - return shifted_input_ids - - class FlaxXGLMAttention(nn.Module): config: XGLMConfig embed_dim: int diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index c07bafe240..d112e641a9 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -476,19 +476,8 @@ class TFXGLMMainLayer(tf.keras.layers.Layer): return combined_attention_mask - def embed_positions( - self, - input_ids: Optional[TFModelInputType] = None, - inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, - past_key_values_length: Optional[int] = None, - ) -> tf.Tensor: - if input_ids is not None: - position_ids = _create_position_ids_from_input_ids(input_ids, past_key_values_length, self.padding_idx) - else: - position_ids = _create_position_ids_from_inputs_embeds( - inputs_embeds, past_key_values_length, self.padding_idx - ) - + def embed_positions(self, position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None) -> tf.Tensor: + position_ids += self.offset positions = tf.gather(self._embed_positions_weights, position_ids, axis=0) return positions @@ -497,6 +486,7 @@ class TFXGLMMainLayer(tf.keras.layers.Layer): self, input_ids: Optional[TFModelInputType] = None, attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None, encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, @@ -528,9 +518,14 @@ class TFXGLMMainLayer(tf.keras.layers.Layer): else: raise ValueError("You have to specify either input_ids or inputs_embeds") - # past_key_values_length past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + if position_ids is None: + position_ids = tf.expand_dims( + tf.range(past_key_values_length, input_shape[-1] + past_key_values_length), axis=0 + ) + position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) + if inputs_embeds is None: # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound # indices on GPU, returning zeros instead. This is a dangerous silent behavior. @@ -552,7 +547,7 @@ class TFXGLMMainLayer(tf.keras.layers.Layer): encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1]) # embed positions - positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length) + positions = self.embed_positions(position_ids) hidden_states = tf.cast(inputs_embeds, dtype=tf.float32) + positions @@ -713,6 +708,11 @@ XGLM_INPUTS_DOCSTRING = r""" - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. @@ -796,6 +796,7 @@ class TFXGLMModel(TFXGLMPreTrainedModel): self, input_ids: Optional[TFModelInputType] = None, attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None, encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, @@ -876,9 +877,6 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss): name="lm_head", ) - # TODO (Joao): investigate why XGLM has numerical issues in XLA generate - self.supports_xla_generation = False - def get_output_embeddings(self): return self.lm_head @@ -890,11 +888,18 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss): if past_key_values: inputs = tf.expand_dims(inputs[:, -1], -1) + position_ids = kwargs.get("position_ids", None) attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None and position_ids is None: + position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True) + if past_key_values: + position_ids = tf.expand_dims(position_ids[:, -1], -1) + return { "input_ids": inputs, "attention_mask": attention_mask, + "position_ids": position_ids, "past_key_values": past_key_values, "use_cache": use_cache, } @@ -911,6 +916,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss): self, input_ids: Optional[TFModelInputType] = None, attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None, encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, @@ -935,6 +941,7 @@ class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss): outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, + position_ids=position_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, head_mask=head_mask, diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index 64cec28edf..3cf16352a7 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -75,11 +75,34 @@ XGLM_INPUTS_DOCSTRING = r""" - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of + the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape @@ -88,20 +111,12 @@ XGLM_INPUTS_DOCSTRING = r""" Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, - sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to - directly pass an embedded representation. This is useful if you want more control over how to convert - `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If - `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see - `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into - associated vectors than the model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape + `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you + can choose to directly pass an embedded representation. This is useful if you want more control over how to + convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -146,18 +161,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) -def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): - """ - Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols - are ignored. This is modified from fairseq's `utils.make_positions`. - """ - # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. - mask = input_ids.ne(padding_idx).int() - incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask - return incremental_indices.long() + padding_idx - - -# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding with M2M100->XGLM class XGLMSinusoidalPositionalEmbedding(nn.Module): """This module produces sinusoidal positional embeddings of any length.""" @@ -198,43 +201,17 @@ class XGLMSinusoidalPositionalEmbedding(nn.Module): return emb.to(torch.get_default_dtype()) @torch.no_grad() - def forward( - self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0 - ): - if input_ids is not None: - bsz, seq_len = input_ids.size() - # Create the position ids from the input token ids. Any padded tokens remain padded. - position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to( - input_ids.device - ) - else: - bsz, seq_len = inputs_embeds.size()[:-1] - position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length) + def forward(self, position_ids: torch.Tensor = None, past_key_values_length: int = 0): + bsz, seq_len = position_ids.size() + position_ids += self.offset - # expand embeddings if needed - max_pos = self.padding_idx + 1 + seq_len + past_key_values_length + # Expand embeddings if needed. `position_ids.max()` is NOT used to keep torch.fx compatibility. + max_pos = 2 + seq_len + past_key_values_length if max_pos > self.weights.size(0): - self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) + self.make_weights(max_pos, self.embedding_dim, self.padding_idx) return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach() - def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length): - """ - We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. - - Args: - inputs_embeds: torch.Tensor - - Returns: torch.Tensor - """ - input_shape = inputs_embeds.size()[:-1] - sequence_length = input_shape[1] - - position_ids = torch.arange( - self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device - ) - return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length - class XGLMAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -605,6 +582,7 @@ class XGLMModel(XGLMPreTrainedModel): self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, @@ -616,70 +594,6 @@ class XGLMModel(XGLMPreTrainedModel): output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - of the decoder. - encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): - Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values - selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*): - Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*): - Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of - shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the - cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those - that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of - all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of - shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing - `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more - control over how to convert `input_ids` indices into associated vectors than the model's internal - embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -698,9 +612,19 @@ class XGLMModel(XGLMPreTrainedModel): else: raise ValueError("You have to specify either input_ids or inputs_embeds") - # past_key_values_length past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + if position_ids is None: + position_ids = torch.arange( + past_key_values_length, + input_shape[-1] + past_key_values_length, + dtype=torch.long, + device=input_ids.device if input_ids is not None else inputs_embeds.device, + ) + position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) + else: + position_ids = position_ids.view(-1, input_shape[-1]) + if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale @@ -713,11 +637,7 @@ class XGLMModel(XGLMPreTrainedModel): # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) - # embed positions - positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length) - - hidden_states = inputs_embeds + positions - + hidden_states = inputs_embeds + self.embed_positions(position_ids, past_key_values_length) hidden_states = nn.functional.dropout(hidden_states, p=float(self.dropout), training=self.training) if self.gradient_checkpointing and self.training: @@ -866,6 +786,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel): self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, @@ -895,6 +816,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel): outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, + position_ids=position_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, head_mask=head_mask, @@ -935,9 +857,18 @@ class XGLMForCausalLM(XGLMPreTrainedModel): def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs ): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) if past_key_values: input_ids = input_ids[:, -1:] @@ -945,6 +876,7 @@ class XGLMForCausalLM(XGLMPreTrainedModel): return { "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed "attention_mask": attention_mask, + "position_ids": position_ids, "past_key_values": past_key_values, "use_cache": use_cache, } diff --git a/tests/models/xglm/test_modeling_tf_xglm.py b/tests/models/xglm/test_modeling_tf_xglm.py index c5baeb510e..61fd805725 100644 --- a/tests/models/xglm/test_modeling_tf_xglm.py +++ b/tests/models/xglm/test_modeling_tf_xglm.py @@ -175,44 +175,6 @@ class TFXGLMModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase name = model.get_bias() assert name is None - @slow - def test_batch_generation(self): - model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M") - tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M") - - tokenizer.padding_side = "left" - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I", - ] - - inputs = tokenizer(sentences, return_tensors="tf", padding=True) - - outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = ( - inputs_non_padded.shape[-1] - - tf.math.reduce_sum(tf.cast(inputs["attention_mask"][-1], dtype=tf.int64)).numpy() - ) - inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little bit of a shy one, but he is very friendly", - "Today, I am going to share with you a few of my favorite things", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - @slow def test_model_from_pretrained(self): for model_name in TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: @@ -246,7 +208,9 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase): tf.random.set_seed(0) tokenized = tokenizer("Today is a nice day and", return_tensors="tf") input_ids = tokenized.input_ids - output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0]) + # forces the generation to happen on CPU, to avoid GPU-related quirks (and assure same output regardless of the available devices) + with tf.device(":/CPU:0"): + output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0]) output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) EXPECTED_OUTPUT_STR = ( @@ -255,33 +219,41 @@ class TFXGLMModelLanguageGenerationTest(unittest.TestCase): self.assertEqual(output_str, EXPECTED_OUTPUT_STR) @slow - def test_lm_generate_xglm_left_padding(self): - """Tests that the generated text is the same, regarless of left padding""" - tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M") + def test_batch_generation(self): model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M") + tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M") tokenizer.padding_side = "left" - generation_kwargs = { - "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids], - "no_repeat_ngram_size": 2, - "do_sample": False, - "repetition_penalty": 1.3, - } - expected_output_string = ( - "Today is a beautiful day and I am so glad that we have the opportunity to spend time with" - ) + # use different length sentences to test batching + sentences = [ + "This is an extremelly long sentence that only exists to test the ability of the model to cope with " + "left-padding, such as in batched generation. The output for the sequence below should be the same " + "regardless of whether left padding is applied or not. When", + "Hello, my dog is a little", + ] - sentences = ["Today is a beautiful day and"] - input_ids = tokenizer(sentences, return_tensors="tf", padding=True) - # using default length - output_ids = model.generate(**input_ids, **generation_kwargs) - output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - self.assertEqual(output_strings[0], expected_output_string) + inputs = tokenizer(sentences, return_tensors="tf", padding=True) + input_ids = inputs["input_ids"] - sentences = ["Today is a beautiful day and", "This is a very long input that we absolutely don't care about"] - input_ids = tokenizer(sentences, return_tensors="tf", padding=True) - # longer max length to capture the full length (remember: it is left padded) - output_ids = model.generate(**input_ids, **generation_kwargs, max_length=28) - output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - self.assertEqual(output_strings[0], expected_output_string) + outputs = model.generate(input_ids=input_ids, attention_mask=inputs["attention_mask"], max_new_tokens=12) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids + output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=12) + + inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids + output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=12) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "This is an extremelly long sentence that only exists to test the ability of the model to cope with " + "left-padding, such as in batched generation. The output for the sequence below should be the same " + "regardless of whether left padding is applied or not. When left padding is applied, the sequence will be " + "a single", + "Hello, my dog is a little bit of a shy one, but he is very friendly", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py index 9fcc25b6d2..5028c30ea9 100644 --- a/tests/models/xglm/test_modeling_xglm.py +++ b/tests/models/xglm/test_modeling_xglm.py @@ -340,46 +340,6 @@ class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_xglm_weight_initialization(*config_and_inputs) - @slow - def test_batch_generation(self): - model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M") - model.to(torch_device) - tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M") - - tokenizer.padding_side = "left" - - # use different length sentences to test batching - sentences = [ - "Hello, my dog is a little", - "Today, I", - ] - - inputs = tokenizer(sentences, return_tensors="pt", padding=True) - input_ids = inputs["input_ids"].to(torch_device) - - outputs = model.generate( - input_ids=input_ids, - attention_mask=inputs["attention_mask"].to(torch_device), - ) - - inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) - output_non_padded = model.generate(input_ids=inputs_non_padded) - - num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item() - inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) - output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) - - batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) - padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) - - expected_output_sentence = [ - "Hello, my dog is a little bit of a shy one, but he is very friendly", - "Today, I am going to share with you a few of my favorite things", - ] - self.assertListEqual(expected_output_sentence, batch_out_sentence) - self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) - @slow def test_model_from_pretrained(self): for model_name in XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: @@ -409,6 +369,49 @@ class XGLMModelLanguageGenerationTest(unittest.TestCase): if verify_outputs: self.assertListEqual(output_ids[0].tolist(), expected_output_ids) + @slow + def test_batch_generation(self): + model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M") + model.to(torch_device) + tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M") + + tokenizer.padding_side = "left" + + # use different length sentences to test batching + sentences = [ + "This is an extremelly long sentence that only exists to test the ability of the model to cope with " + "left-padding, such as in batched generation. The output for the sequence below should be the same " + "regardless of whether left padding is applied or not. When", + "Hello, my dog is a little", + ] + + inputs = tokenizer(sentences, return_tensors="pt", padding=True) + input_ids = inputs["input_ids"].to(torch_device) + + outputs = model.generate( + input_ids=input_ids, attention_mask=inputs["attention_mask"].to(torch_device), max_new_tokens=12 + ) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device) + output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=12) + + inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device) + output_padded = model.generate(input_ids=inputs_padded, max_new_tokens=12) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "This is an extremelly long sentence that only exists to test the ability of the model to cope with " + "left-padding, such as in batched generation. The output for the sequence below should be the same " + "regardless of whether left padding is applied or not. When left padding is applied, the sequence will be " + "a single", + "Hello, my dog is a little bit of a shy one, but he is very friendly", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + @slow def test_lm_generate_xglm(self): self._test_lm_generate_xglm_helper()