Align gpt2 mask preparation to #37612 (#37787)

Update modeling_gpt2.py
This commit is contained in:
Cyril Vallez
2025-04-25 12:50:30 +02:00
committed by GitHub
parent 50d231a806
commit ba3bd37253

View File

@@ -1119,7 +1119,7 @@ class GPT2Model(GPT2PreTrainedModel):
):
return None
dtype, device = input_tensor.dtype, input_tensor.device
dtype = input_tensor.dtype
sequence_length = input_tensor.shape[1]
if using_static_cache:
target_length = past_key_values.get_max_cache_shape()
@@ -1136,7 +1136,6 @@ class GPT2Model(GPT2PreTrainedModel):
sequence_length=sequence_length,
target_length=target_length,
dtype=dtype,
device=device,
cache_position=cache_position,
batch_size=input_tensor.shape[0],
)
@@ -1161,7 +1160,6 @@ class GPT2Model(GPT2PreTrainedModel):
sequence_length: int,
target_length: int,
dtype: torch.dtype,
device: torch.device,
cache_position: torch.Tensor,
batch_size: int,
**kwargs,
@@ -1181,8 +1179,6 @@ class GPT2Model(GPT2PreTrainedModel):
to account for the 0 padding, the part of the cache that is not filled yet.
dtype (`torch.dtype`):
The dtype to use for the 4D attention mask.
device (`torch.device`):
The device to place the 4D attention mask on.
cache_position (`torch.Tensor`):
Indices depicting the position of the input sequence tokens in the sequence.
batch_size (`int`):
@@ -1194,11 +1190,11 @@ class GPT2Model(GPT2PreTrainedModel):
else:
min_dtype = torch.finfo(dtype).min
causal_mask = torch.full(
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
)
if sequence_length != 1:
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
if attention_mask is not None:
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit