[OPT] Run test in lower precision on GPU (#17353)
* [OPT] Run test only in half precision * up * up * up * up * finish * fix on GPU * Update tests/models/opt/test_modeling_opt.py
This commit is contained in:
committed by
GitHub
parent
2b282296f1
commit
e8714c0307
@@ -17,9 +17,8 @@ import random
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.checkpoint
|
||||
from torch import Tensor, nn
|
||||
from torch import nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
from ...activations import ACT2FN
|
||||
@@ -86,52 +85,28 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
|
||||
return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)
|
||||
|
||||
|
||||
def make_positions(mask, padding_idx: int):
|
||||
"""Replace non-padding symbols with their position numbers.
|
||||
|
||||
Position numbers begin at padding_idx+1. Padding symbols are ignored.
|
||||
"""
|
||||
# The series of casts and type-conversions here are carefully
|
||||
# balanced to both work with ONNX export and XLA. In particular XLA
|
||||
# prefers ints, cumsum defaults to output longs, and ONNX doesn't know
|
||||
# how to handle the dtype kwarg in cumsum.
|
||||
positions = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx
|
||||
return positions
|
||||
|
||||
|
||||
class OPTLearnedPositionalEmbedding(nn.Embedding):
|
||||
"""
|
||||
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
|
||||
based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
|
||||
the forward function.
|
||||
This module learns positional embeddings up to a fixed maximum size.
|
||||
"""
|
||||
|
||||
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int = 1):
|
||||
super().__init__(num_embeddings, embedding_dim, padding_idx)
|
||||
self.onnx_trace = False
|
||||
if self.padding_idx is not None:
|
||||
self.max_positions = self.num_embeddings - self.padding_idx - 1
|
||||
else:
|
||||
self.max_positions = self.num_embeddings
|
||||
def __init__(self, num_embeddings: int, embedding_dim: int):
|
||||
# OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
|
||||
# and adjust num_embeddings appropriately. Other models don't have this hack
|
||||
self.offset = 2
|
||||
super().__init__(num_embeddings + self.offset, embedding_dim)
|
||||
|
||||
def forward(self, attention_mask: Tensor, positions: Optional[Tensor] = None):
|
||||
# attention_masks is expected to be of size [batch_size x seq_len].
|
||||
if not ((positions is None) or (self.padding_idx is None)):
|
||||
raise ValueError("If positions is pre-computed then padding_idx should not be set.")
|
||||
def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
|
||||
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
|
||||
attention_mask = attention_mask.long()
|
||||
|
||||
if positions is None:
|
||||
attention_mask = attention_mask.long()
|
||||
positions = make_positions(attention_mask, self.padding_idx)
|
||||
# create positions depending on attention_mask
|
||||
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
|
||||
|
||||
return F.embedding(
|
||||
positions,
|
||||
self.weight,
|
||||
self.padding_idx,
|
||||
self.max_norm,
|
||||
self.norm_type,
|
||||
self.scale_grad_by_freq,
|
||||
self.sparse,
|
||||
)
|
||||
# cut positions if `past_key_values_length` is > 0
|
||||
positions = positions[:, past_key_values_length:]
|
||||
|
||||
return super().forward(positions + self.offset)
|
||||
|
||||
|
||||
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->OPT
|
||||
@@ -504,12 +479,7 @@ class OPTDecoder(OPTPreTrainedModel):
|
||||
self.vocab_size = config.vocab_size
|
||||
|
||||
self.embed_tokens = nn.Embedding(config.vocab_size, config.word_embed_proj_dim, self.padding_idx)
|
||||
|
||||
# OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
|
||||
if self.padding_idx is not None:
|
||||
num_embeddings = config.max_position_embeddings + 2
|
||||
|
||||
self.embed_positions = OPTLearnedPositionalEmbedding(num_embeddings, config.hidden_size, self.padding_idx)
|
||||
self.embed_positions = OPTLearnedPositionalEmbedding(config.max_position_embeddings, config.hidden_size)
|
||||
|
||||
if config.word_embed_proj_dim != config.hidden_size:
|
||||
self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
|
||||
@@ -639,8 +609,7 @@ class OPTDecoder(OPTPreTrainedModel):
|
||||
# embed positions
|
||||
if attention_mask is None:
|
||||
attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.bool, device=inputs_embeds.device)
|
||||
|
||||
positions = self.embed_positions(attention_mask)[:, past_key_values_length:, :]
|
||||
pos_embeds = self.embed_positions(attention_mask, past_key_values_length)
|
||||
|
||||
attention_mask = self._prepare_decoder_attention_mask(
|
||||
attention_mask, input_shape, inputs_embeds, past_key_values_length
|
||||
@@ -649,8 +618,7 @@ class OPTDecoder(OPTPreTrainedModel):
|
||||
if self.project_in is not None:
|
||||
inputs_embeds = self.project_in(inputs_embeds)
|
||||
|
||||
hidden_states = inputs_embeds + positions
|
||||
|
||||
hidden_states = inputs_embeds + pos_embeds
|
||||
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
|
||||
# decoder layers
|
||||
@@ -671,6 +639,7 @@ class OPTDecoder(OPTPreTrainedModel):
|
||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
dropout_probability = random.uniform(0, 1)
|
||||
if self.training and (dropout_probability < self.layerdrop):
|
||||
continue
|
||||
|
||||
@@ -271,14 +271,25 @@ class OPTModelIntegrationTests(unittest.TestCase):
|
||||
def test_inference_no_head(self):
|
||||
model = OPTModel.from_pretrained("facebook/opt-350m").to(torch_device)
|
||||
input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
|
||||
|
||||
with torch.no_grad():
|
||||
output = model(input_ids=input_ids).last_hidden_state
|
||||
|
||||
expected_shape = torch.Size((1, 11, 512))
|
||||
self.assertEqual(output.shape, expected_shape)
|
||||
expected_slice = torch.tensor(
|
||||
[[-0.2867, -1.9256, -0.3062], [-1.2711, -0.1337, -0.1897], [0.4109, 0.1187, -1.3142]], device=torch_device
|
||||
[[-0.2873, -1.9242, -0.3059], [-1.2738, -0.1333, -0.1877], [0.4116, 0.1192, -1.3107]],
|
||||
device=torch_device,
|
||||
)
|
||||
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3))
|
||||
# Getting different logits results on GPU depending on PyTorch version (1.10+cu11.0 vs. 1.11+cu11.4)
|
||||
# and results also differ between CPU and GPU. Only on CPU it seems to be deterministic.
|
||||
|
||||
# It's not because the weights are saved & loaded in FP16
|
||||
# checked that the same happens when weights are stored in fp32 and loaded in fp32.
|
||||
# The differences start to creep in in the first linear projection matrix project_in_dim
|
||||
# It however also happens for BART (maybe related to training model in fp16?)
|
||||
atol = 1e-2 if torch_device != "cpu" else 1e-3
|
||||
assert_tensors_close(output[0, :3, :3], expected_slice, atol=atol)
|
||||
|
||||
|
||||
@require_torch
|
||||
|
||||
Reference in New Issue
Block a user