From 641238eb766ecac073e985dbfb926fd55a55600f Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Tue, 21 Jan 2025 00:19:31 +0800
Subject: [PATCH] Fix vits low-precision dtype (#35418)

* fix vits dtype

Signed-off-by: jiqing-feng

* add tests

Signed-off-by: jiqing-feng

* use weight dtype

Signed-off-by: jiqing-feng

---------

Signed-off-by: jiqing-feng
---
 src/transformers/models/vits/modeling_vits.py |  5 +--
 tests/models/vits/test_modeling_vits.py       | 32 +++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/vits/modeling_vits.py b/src/transformers/models/vits/modeling_vits.py
index 4f42c0405d..7a506d497f 100644
--- a/src/transformers/models/vits/modeling_vits.py
+++ b/src/transformers/models/vits/modeling_vits.py
@@ -1406,10 +1406,11 @@ class VitsModel(VitsPreTrainedModel):
         if labels is not None:
             raise NotImplementedError("Training of VITS is not supported yet.")
 
+        mask_dtype = self.text_encoder.embed_tokens.weight.dtype
         if attention_mask is not None:
-            input_padding_mask = attention_mask.unsqueeze(-1).float()
+            input_padding_mask = attention_mask.unsqueeze(-1).to(mask_dtype)
         else:
-            input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()
+            input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).to(mask_dtype)
 
         if self.config.num_speakers > 1 and speaker_id is not None:
             if not 0 <= speaker_id < self.config.num_speakers:
diff --git a/tests/models/vits/test_modeling_vits.py b/tests/models/vits/test_modeling_vits.py
index 3661940909..9733fb4bce 100644
--- a/tests/models/vits/test_modeling_vits.py
+++ b/tests/models/vits/test_modeling_vits.py
@@ -27,6 +27,7 @@ from transformers.testing_utils import (
     is_flaky,
     is_torch_available,
     require_torch,
+    require_torch_fp16,
     require_torch_multi_gpu,
     slow,
     torch_device,
@@ -434,3 +435,34 @@ class VitsModelIntegrationTests(unittest.TestCase):
         )
         # fmt: on
         self.assertTrue(torch.allclose(outputs.waveform[0, 10000:10030].cpu(), EXPECTED_LOGITS, atol=1e-4))
+
+    @require_torch_fp16
+    def test_forward_fp16(self):
+        # GPU gives different results than CPU
+        torch_device = "cpu"
+
+        model = VitsModel.from_pretrained("facebook/mms-tts-eng", torch_dtype=torch.float16)
+        model.to(torch_device)
+
+        tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+
+        set_seed(555)  # make deterministic
+
+        input_text = "Mister quilter is the apostle of the middle classes and we are glad to welcome his gospel!"
+        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(torch_device)
+
+        with torch.no_grad():
+            outputs = model(input_ids)
+
+        self.assertEqual(outputs.waveform.shape, (1, 87040))
+        # fmt: off
+        EXPECTED_LOGITS = torch.tensor(
+            [
+                0.0101, 0.0318, 0.0489, 0.0627, 0.0728, 0.0865, 0.1053, 0.1279,
+                0.1514, 0.1703, 0.1827, 0.1829, 0.1694, 0.1509, 0.1332, 0.1188,
+                0.1066, 0.0978, 0.0936, 0.0867, 0.0724, 0.0493, 0.0197, -0.0141,
+                -0.0501, -0.0817, -0.1065, -0.1223, -0.1311, -0.1339
+            ]
+        ).to(torch.float16)
+        # fmt: on
+        self.assertTrue(torch.allclose(outputs.waveform[0, 10000:10030].cpu(), EXPECTED_LOGITS, atol=1e-4))