Tests: upcast logits to float() (#34042)

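Upcast model logits to float32 with `.float()` before comparing them against the expected values in the integration tests.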
This commit is contained in:
Joao Gante
2024-10-11 11:51:49 +01:00
committed by GitHub
parent 4b9bfd32f0
commit e878eaa9fc
11 changed files with 35 additions and 19 deletions

View File

@@ -538,7 +538,7 @@ class GraniteIntegrationTest(unittest.TestCase):
self.assertTrue( self.assertTrue(
torch.allclose( torch.allclose(
EXPECTED_SLICE.to(torch_device), EXPECTED_SLICE.to(torch_device),
out.logits[0, 0, :15], out.logits[0, 0, :15].float(),
atol=1e-3, atol=1e-3,
rtol=1e-3, rtol=1e-3,
) )
@@ -558,4 +558,4 @@ class GraniteIntegrationTest(unittest.TestCase):
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]]) EXPECTED_MEAN = torch.tensor([[-2.0984, -3.1294, -2.8153, -2.3568, -2.7337, -2.2624, -2.6016, -2.4022]])
self.assertTrue(torch.allclose(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1), atol=1e-2, rtol=1e-2)) self.assertTrue(torch.allclose(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), atol=1e-2, rtol=1e-2))

View File

@@ -525,7 +525,9 @@ class GraniteMoeIntegrationTest(unittest.TestCase):
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]]) EXPECTED_MEAN = torch.tensor([[-2.2122, -1.6632, -2.9269, -2.3344, -2.0143, -3.0146, -2.6839, -2.5610]])
self.assertTrue(torch.allclose(EXPECTED_MEAN.to(torch_device), out.logits.mean(-1), atol=1e-2, rtol=1e-2)) self.assertTrue(
torch.allclose(EXPECTED_MEAN.to(torch_device), out.logits.float().mean(-1), atol=1e-2, rtol=1e-2)
)
# slicing logits[0, 0, 0:15] # slicing logits[0, 0, 0:15]
EXPECTED_SLICE = torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892, EXPECTED_SLICE = torch.tensor([[4.8785, -2.2890, -2.2892, -2.2885, -2.2890, -3.5007, -2.2897, -2.2892,
@@ -535,7 +537,7 @@ class GraniteMoeIntegrationTest(unittest.TestCase):
self.assertTrue( self.assertTrue(
torch.allclose( torch.allclose(
EXPECTED_SLICE.to(torch_device), EXPECTED_SLICE.to(torch_device),
out.logits[0, 0, :15], out.logits[0, 0, :15].float(),
atol=1e-3, atol=1e-3,
rtol=1e-3, rtol=1e-3,
) )

View File

@@ -481,7 +481,7 @@ class JetMoeIntegrationTest(unittest.TestCase):
model = JetMoeForCausalLM.from_pretrained("jetmoe/jetmoe-8b") model = JetMoeForCausalLM.from_pretrained("jetmoe/jetmoe-8b")
input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
with torch.no_grad(): with torch.no_grad():
out = model(input_ids).logits.cpu() out = model(input_ids).logits.float().cpu()
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[0.2507, -2.7073, -1.3445, -1.9363, -1.7216, -1.7370, -1.9054, -1.9792]]) EXPECTED_MEAN = torch.tensor([[0.2507, -2.7073, -1.3445, -1.9363, -1.7216, -1.7370, -1.9054, -1.9792]])
torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)

View File

@@ -773,7 +773,14 @@ class LlamaIntegrationTest(unittest.TestCase):
8: torch.tensor([[-6.5208, -4.1218, -4.9377, -3.2536, 0.8127, -2.9811, 1.2918, -3.3848]]) 8: torch.tensor([[-6.5208, -4.1218, -4.9377, -3.2536, 0.8127, -2.9811, 1.2918, -3.3848]])
} }
self.assertTrue(torch.allclose(EXPECTED_MEAN[self.cuda_compute_capability_major_version].to(torch_device), out.logits.mean(-1), atol=1e-2, rtol=1e-2)) self.assertTrue(
torch.allclose(
EXPECTED_MEAN[self.cuda_compute_capability_major_version].to(torch_device),
out.logits.float().mean(-1),
atol=1e-2,
rtol=1e-2
)
)
# slicing logits[0, 0, 0:15] # slicing logits[0, 0, 0:15]
EXPECTED_SLICE = { EXPECTED_SLICE = {
@@ -785,7 +792,7 @@ class LlamaIntegrationTest(unittest.TestCase):
self.assertTrue( self.assertTrue(
torch.allclose( torch.allclose(
EXPECTED_SLICE[self.cuda_compute_capability_major_version].to(torch_device), EXPECTED_SLICE[self.cuda_compute_capability_major_version].to(torch_device),
out.logits[0, 0, :15], out.logits[0, 0, :15].float(),
atol=1e-2, atol=1e-2,
rtol=1e-2, rtol=1e-2,
) )
@@ -810,7 +817,14 @@ class LlamaIntegrationTest(unittest.TestCase):
8: torch.tensor([[-6.6544, -4.1259, -4.9840, -3.2456, 0.8261, -3.0124, 1.2971, -3.3641]]) 8: torch.tensor([[-6.6544, -4.1259, -4.9840, -3.2456, 0.8261, -3.0124, 1.2971, -3.3641]])
} }
self.assertTrue(torch.allclose(EXPECTED_MEAN[self.cuda_compute_capability_major_version].to(torch_device), out.logits.mean(-1), atol=1e-2, rtol=1e-2)) self.assertTrue(
torch.allclose(
EXPECTED_MEAN[self.cuda_compute_capability_major_version].to(torch_device),
out.logits.float().mean(-1),
atol=1e-2,
rtol=1e-2
)
)
# slicing logits[0, 0, 0:15] # slicing logits[0, 0, 0:15]
EXPECTED_SLICE = { EXPECTED_SLICE = {
@@ -822,7 +836,7 @@ class LlamaIntegrationTest(unittest.TestCase):
self.assertTrue( self.assertTrue(
torch.allclose( torch.allclose(
EXPECTED_SLICE[self.cuda_compute_capability_major_version].to(torch_device), EXPECTED_SLICE[self.cuda_compute_capability_major_version].to(torch_device),
out.logits[0, 0, :15], out.logits[0, 0, :15].float(),
atol=1e-2, atol=1e-2,
rtol=1e-2, rtol=1e-2,
) )

View File

@@ -524,7 +524,7 @@ class MistralIntegrationTest(unittest.TestCase):
) )
input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
with torch.no_grad(): with torch.no_grad():
out = model(input_ids).logits.cpu() out = model(input_ids).logits.float().cpu()
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]]) EXPECTED_MEAN = torch.tensor([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]])
torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)

View File

@@ -360,7 +360,7 @@ class OlmoIntegrationTest(unittest.TestCase):
def test_model_1b_logits(self): def test_model_1b_logits(self):
input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]] input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]]
model = OlmoForCausalLM.from_pretrained("allenai/OLMo-1B-hf", device_map="auto") model = OlmoForCausalLM.from_pretrained("allenai/OLMo-1B-hf", device_map="auto")
out = model(torch.tensor(input_ids)).logits out = model(torch.tensor(input_ids)).logits.float()
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[2.2869, 0.3315, 0.9876, 1.4146, 1.8804, 2.0430, 1.7055, 1.2065]]) EXPECTED_MEAN = torch.tensor([[2.2869, 0.3315, 0.9876, 1.4146, 1.8804, 2.0430, 1.7055, 1.2065]])
torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)
@@ -372,7 +372,7 @@ class OlmoIntegrationTest(unittest.TestCase):
def test_model_7b_logits(self): def test_model_7b_logits(self):
input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]] input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]]
model = OlmoForCausalLM.from_pretrained("allenai/OLMo-7B-hf", device_map="auto") model = OlmoForCausalLM.from_pretrained("allenai/OLMo-7B-hf", device_map="auto")
out = model(torch.tensor(input_ids)).logits out = model(torch.tensor(input_ids)).logits.float()
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[0.0271, 0.0249, -0.0578, -0.0870, 0.0167, 0.0710, 0.1002, 0.0677]]) EXPECTED_MEAN = torch.tensor([[0.0271, 0.0249, -0.0578, -0.0870, 0.0167, 0.0710, 0.1002, 0.0677]])
torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)
@@ -384,7 +384,7 @@ class OlmoIntegrationTest(unittest.TestCase):
def test_model_7b_twin_2t_logits(self): def test_model_7b_twin_2t_logits(self):
input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]] input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]]
model = OlmoForCausalLM.from_pretrained("allenai/OLMo-7B-Twin-2T-hf", device_map="auto") model = OlmoForCausalLM.from_pretrained("allenai/OLMo-7B-Twin-2T-hf", device_map="auto")
out = model(torch.tensor(input_ids)).logits out = model(torch.tensor(input_ids)).logits.float()
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-0.3636, -0.3825, -0.4800, -0.3696, -0.8388, -0.9737, -0.9849, -0.8356]]) EXPECTED_MEAN = torch.tensor([[-0.3636, -0.3825, -0.4800, -0.3696, -0.8388, -0.9737, -0.9849, -0.8356]])
torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)

View File

@@ -375,7 +375,7 @@ class OlmoeIntegrationTest(unittest.TestCase):
def test_model_7b_logits(self): def test_model_7b_logits(self):
input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]] input_ids = [[1, 306, 4658, 278, 6593, 310, 2834, 338]]
model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924", device_map="auto") model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924", device_map="auto")
out = model(torch.tensor(input_ids)).logits out = model(torch.tensor(input_ids)).logits.float()
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-1.3814, -3.4450, -2.2990, -1.9542, -2.4387, -2.7941, -2.9312, -2.8309]]) EXPECTED_MEAN = torch.tensor([[-1.3814, -3.4450, -2.2990, -1.9542, -2.4387, -2.7941, -2.9312, -2.8309]])
torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)

View File

@@ -496,7 +496,7 @@ class PersimmonIntegrationTest(unittest.TestCase):
model = PersimmonForCausalLM.from_pretrained( model = PersimmonForCausalLM.from_pretrained(
"adept/persimmon-8b-chat", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.float16 "adept/persimmon-8b-chat", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.float16
) )
out = model(torch.tensor([input_ids], device=torch_device)).logits out = model(torch.tensor([input_ids], device=torch_device)).logits.float()
EXPECTED_MEAN = torch.tensor( EXPECTED_MEAN = torch.tensor(
[[-11.4726, -11.1495, -11.2694, -11.2223, -10.9452, -11.0663, -11.0031, -11.1028]] [[-11.4726, -11.1495, -11.2694, -11.2223, -10.9452, -11.0663, -11.0031, -11.1028]]

View File

@@ -518,7 +518,7 @@ class Qwen2IntegrationTest(unittest.TestCase):
model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2-450m-beta", device_map="auto") model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2-450m-beta", device_map="auto")
input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
with torch.no_grad(): with torch.no_grad():
out = model(input_ids).logits.cpu() out = model(input_ids).logits.float().cpu()
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]]) EXPECTED_MEAN = torch.tensor([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]])
torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)

View File

@@ -580,7 +580,7 @@ class Qwen2MoeIntegrationTest(unittest.TestCase):
model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", device_map="auto") model = Qwen2MoeForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B", device_map="auto")
input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
with torch.no_grad(): with torch.no_grad():
out = model(input_ids).logits.cpu() out = model(input_ids).logits.float().cpu()
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-4.2125, -3.6416, -4.9136, -4.3005, -4.9938, -3.4393, -3.5195, -4.1621]]) EXPECTED_MEAN = torch.tensor([[-4.2125, -3.6416, -4.9136, -4.3005, -4.9938, -3.4393, -3.5195, -4.1621]])
torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)

View File

@@ -482,7 +482,7 @@ class StableLmModelIntegrationTest(unittest.TestCase):
model = StableLmForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t").to(torch_device) model = StableLmForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t").to(torch_device)
model.eval() model.eval()
output = model(**input_ids).logits output = model(**input_ids).logits.float()
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[2.7146, 2.4245, 1.5616, 1.4424, 2.6790]]).to(torch_device) EXPECTED_MEAN = torch.tensor([[2.7146, 2.4245, 1.5616, 1.4424, 2.6790]]).to(torch_device)
@@ -515,7 +515,7 @@ class StableLmModelIntegrationTest(unittest.TestCase):
model = StableLmForCausalLM.from_pretrained("stabilityai/tiny-random-stablelm-2").to(torch_device) model = StableLmForCausalLM.from_pretrained("stabilityai/tiny-random-stablelm-2").to(torch_device)
model.eval() model.eval()
output = model(**input_ids).logits output = model(**input_ids).logits.float()
# Expected mean on dim = -1 # Expected mean on dim = -1
EXPECTED_MEAN = torch.tensor([[-2.7196, -3.6099, -2.6877, -3.1973, -3.9344]]).to(torch_device) EXPECTED_MEAN = torch.tensor([[-2.7196, -3.6099, -2.6877, -3.1973, -3.9344]]).to(torch_device)