Align gpt2 mask preparation to #37612 (#37787)

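Update modeling_gpt2.py: drop the `device` argument from the 4D causal-mask helper and create the mask on `cache_position.device` instead.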
2025-04-25 12:50:30 +02:00
parent 50d231a806
commit ba3bd37253
1 changed files with 3 additions and 7 deletions
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -1119,7 +1119,7 @@ class GPT2Model(GPT2PreTrainedModel):
            ):
                return None

-        dtype, device = input_tensor.dtype, input_tensor.device
+        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
@@ -1136,7 +1136,6 @@ class GPT2Model(GPT2PreTrainedModel):
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
-            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )
@@ -1161,7 +1160,6 @@ class GPT2Model(GPT2PreTrainedModel):
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
-        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
@@ -1181,8 +1179,6 @@ class GPT2Model(GPT2PreTrainedModel):
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
-            device (`torch.device`):
-                The device to plcae the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
@@ -1194,11 +1190,11 @@ class GPT2Model(GPT2PreTrainedModel):
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
-                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
-            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit