Fix more inefficient PyTorch (PT) operations (#37060)

* Fix inefficient operations

* Remove cpu() call

* Reorder detach()

* Reorder detach()

* Call tolist() without detach()

* Call item() without detach()

* Update src/transformers/models/rag/modeling_rag.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update tests/models/encodec/test_modeling_encodec.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Use detach().cpu().numpy()

* Revert some numpy operations

* More fixes

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
cyyever
2025-03-31 23:31:24 +08:00
committed by GitHub
parent a1e389e637
commit 786d9c5ed9
54 changed files with 106 additions and 104 deletions

View File

@@ -363,7 +363,7 @@ class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
output_non_padded = model.generate(input_ids=inputs_non_padded)
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)

View File

@@ -406,7 +406,7 @@ class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
output_non_padded = model.generate(input_ids=inputs_non_padded)
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)

View File

@@ -352,7 +352,7 @@ class Data2VecVisionModelIntegrationTest(unittest.TestCase):
torch.testing.assert_close(logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
expected_top2 = [model.config.label2id[i] for i in ["remote control, remote", "tabby, tabby cat"]]
self.assertEqual(logits[0].topk(2).indices.cpu().tolist(), expected_top2)
self.assertEqual(logits[0].topk(2).indices.tolist(), expected_top2)
@slow
def test_inference_interpolate_pos_encoding(self):

View File

@@ -117,7 +117,7 @@ class EncodecModelTester:
config.normalize = True
processor = EncodecFeatureExtractor(feature_size=config.audio_channels, sampling_rate=config.sampling_rate)
input_values = list(input_values.cpu().numpy())
input_values = input_values.tolist()
inputs_dict = processor(
input_values, sampling_rate=config.sampling_rate, padding=True, return_tensors="pt"
).to(torch_device)
@@ -495,7 +495,7 @@ class EncodecIntegrationTest(unittest.TestCase):
# use max bandwidth for best possible reconstruction
encoder_outputs = model.encode(inputs["input_values"], bandwidth=float(bandwidth))
audio_code_sums = [a[0].sum().cpu().item() for a in encoder_outputs[0]]
audio_code_sums = [a[0].sum().item() for a in encoder_outputs[0]]
# make sure audio encoded codes are correct
self.assertListEqual(audio_code_sums, expected_codesums[bandwidth])
@@ -552,7 +552,7 @@ class EncodecIntegrationTest(unittest.TestCase):
encoder_outputs = model.encode(
inputs["input_values"], inputs["padding_mask"], bandwidth=float(bandwidth), return_dict=False
)
audio_code_sums = [a[0].sum().cpu().item() for a in encoder_outputs[0]]
audio_code_sums = [a[0].sum().item() for a in encoder_outputs[0]]
# make sure audio encoded codes are correct
self.assertListEqual(audio_code_sums, expected_codesums[bandwidth])
@@ -610,8 +610,8 @@ class EncodecIntegrationTest(unittest.TestCase):
with torch.no_grad():
# use max bandwidth for best possible reconstruction
encoder_outputs = model.encode(input_values, bandwidth=float(bandwidth), return_dict=False)
audio_code_sums_0 = [a[0][0].sum().cpu().item() for a in encoder_outputs[0]]
audio_code_sums_1 = [a[0][1].sum().cpu().item() for a in encoder_outputs[0]]
audio_code_sums_0 = [a[0][0].sum().item() for a in encoder_outputs[0]]
audio_code_sums_1 = [a[0][1].sum().item() for a in encoder_outputs[0]]
# make sure audio encoded codes are correct
self.assertListEqual(audio_code_sums_0, expected_codesums[bandwidth][0])

View File

@@ -662,7 +662,7 @@ class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
output_non_padded = model.generate(input_ids=inputs_non_padded, max_length=20)
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
@@ -724,7 +724,7 @@ class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
output_non_padded = model.generate(input_ids=inputs_non_padded, max_length=20)
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)

View File

@@ -552,7 +552,7 @@ class GPTNeoModelLanguageGenerationTest(unittest.TestCase):
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
output_non_padded = model.generate(input_ids=inputs_non_padded)
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)

View File

@@ -466,7 +466,7 @@ class GPTJModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
output_non_padded = model.generate(input_ids=inputs_non_padded)
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)

View File

@@ -540,7 +540,7 @@ class MimiIntegrationTest(unittest.TestCase):
# use max bandwidth for best possible reconstruction
encoder_outputs = model.encode(inputs["input_values"], num_quantizers=int(num_codebooks))
audio_code_sums = encoder_outputs[0].sum().cpu().item()
audio_code_sums = encoder_outputs[0].sum().item()
# make sure audio encoded codes are correct
# assert relative difference less than a threshold, because `audio_code_sums` varies a bit

View File

@@ -951,8 +951,8 @@ class MoshiIntegrationTests(unittest.TestCase):
expected_text_token = 452
expected_audio_tokens = [916, 1396, 1238, 579, 1105, 914, 1257, 810] # fmt: skip
self.assertTrue(expected_text_token == model_outputs.sequences[0, -2].cpu().item())
self.assertTrue(expected_audio_tokens == model_outputs.audio_codes[0, :, -1].cpu().tolist())
self.assertTrue(expected_text_token == model_outputs.sequences[0, -2].item())
self.assertTrue(expected_audio_tokens == model_outputs.audio_codes[0, :, -1].tolist())
@slow
def test_moshiko_greedy_unconditional_fp16_eager(self):
@@ -966,7 +966,7 @@ class MoshiIntegrationTests(unittest.TestCase):
)
# eager equivalence is not as strict as sdpa.
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist())
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].tolist())
@slow
def test_moshiko_greedy_unconditional_fp32(self):
@@ -986,8 +986,8 @@ class MoshiIntegrationTests(unittest.TestCase):
audio_code_sums = model_outputs.audio_codes.sum().item()
self.assertTrue(np.abs(audio_code_sums - expected_audio_codesum) <= (3e-3 * audio_code_sums))
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].cpu().tolist())
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist())
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].tolist())
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].tolist())
@slow
@require_torch_fp16
@@ -1008,8 +1008,8 @@ class MoshiIntegrationTests(unittest.TestCase):
audio_code_sums = model_outputs.audio_codes.sum().item()
self.assertTrue(np.abs(audio_code_sums - expected_audio_codesum) <= (3e-3 * audio_code_sums))
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].cpu().tolist())
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist())
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].tolist())
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].tolist())
@slow
@require_torch_fp16
@@ -1030,5 +1030,5 @@ class MoshiIntegrationTests(unittest.TestCase):
audio_code_sums = model_outputs.audio_codes.sum().item()
self.assertTrue(np.abs(audio_code_sums - expected_audio_codesum) <= 2048)
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].cpu().tolist())
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist())
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].tolist())
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].tolist())

View File

@@ -486,7 +486,7 @@ class OPTGenerationTest(unittest.TestCase):
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
output_non_padded = model.generate(input_ids=inputs_non_padded)
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)

View File

@@ -989,7 +989,7 @@ class Owlv2ModelIntegrationTest(unittest.TestCase):
outputs, text_labels=text_labels
)
objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
objects_labels = post_processed_output_with_text_labels[0]["labels"].tolist()
self.assertListEqual(objects_labels, [0, 0])
objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]

View File

@@ -975,7 +975,7 @@ class OwlViTModelIntegrationTest(unittest.TestCase):
outputs, text_labels=text_labels
)
objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
objects_labels = post_processed_output_with_text_labels[0]["labels"].tolist()
self.assertListEqual(objects_labels, [0, 0])
objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]

View File

@@ -311,7 +311,7 @@ class RagTestMixin:
out = retriever(
input_ids,
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
question_hidden_states.detach().to(device="cpu", dtype=torch.float32).numpy(),
prefix=config.generator.prefix,
return_tensors="pt",
)
@@ -379,7 +379,7 @@ class RagTestMixin:
out = retriever(
input_ids,
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
question_hidden_states.detach().to(device="cpu", dtype=torch.float32).numpy(),
prefix=config.generator.prefix,
return_tensors="pt",
)
@@ -438,7 +438,7 @@ class RagTestMixin:
out = retriever(
input_ids,
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
question_hidden_states.detach().to(device="cpu", dtype=torch.float32).numpy(),
prefix=config.generator.prefix,
return_tensors="pt",
n_docs=n_docs,
@@ -507,7 +507,7 @@ class RagTestMixin:
out = retriever(
input_ids,
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
question_hidden_states.detach().to(device="cpu", dtype=torch.float32).numpy(),
prefix=config.generator.prefix,
return_tensors="pt",
n_docs=retriever_n_docs,
@@ -964,7 +964,7 @@ class RagModelIntegrationTests(unittest.TestCase):
question_hidden_states = rag_sequence.question_encoder(input_ids, attention_mask=attention_mask)[0]
docs_dict = retriever(
input_ids.cpu().detach().numpy(), question_hidden_states.cpu().detach().numpy(), return_tensors="pt"
input_ids.detach().cpu().numpy(), question_hidden_states.detach().cpu().numpy(), return_tensors="pt"
)
doc_scores = torch.bmm(
question_hidden_states.unsqueeze(1),

View File

@@ -1044,7 +1044,7 @@ class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase):
).loss
# loss_more_masked has to be bigger or equal loss since more masked inputs have to be predicted
self.assertTrue(loss.detach().item() <= loss_more_masked.detach().item())
self.assertTrue(loss.item() <= loss_more_masked.item())
def test_mask_feature_prob_ctc(self):
model = Wav2Vec2ForCTC.from_pretrained(

View File

@@ -670,7 +670,7 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
#
# input_values = feature_extractor(sample["audio"]["array"], return_tensors="pt").input_values
# logits = model(input_values).logits
# pred_ids = torch.argmax(logits, axis=-1).cpu().tolist()
# pred_ids = torch.argmax(logits, axis=-1).tolist()
# ```
# fmt: off
pred_ids = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 11, 0, 0, 0, 22, 0, 0, 4, 4, 4, 14, 0, 0, 0, 0, 0, 8, 8, 0, 5, 5, 0, 12, 0, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 10, 0, 0, 0, 15, 0, 0, 10, 0, 0, 0, 12, 0, 0, 0, 0, 0, 7, 0, 9, 0, 0, 14, 0, 0, 0, 13, 0, 7, 0, 0, 4, 4, 0, 15, 8, 8, 0, 0, 8, 0, 26, 0, 0, 4, 4, 0, 0, 15, 0, 0, 0, 0, 0, 0, 10, 0, 26, 5, 5, 0, 4, 4, 0, 0, 12, 11, 0, 0, 5, 4, 4, 4, 0, 18, 0, 0, 0, 7, 9, 9, 0, 6, 0, 12, 12, 4, 4, 0, 6, 0, 0, 8, 0, 4, 4, 4, 0, 19, 0, 0, 8, 9, 9, 0, 0, 0, 0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 16, 16, 0, 0, 17, 5, 5, 5, 0, 4, 4, 4, 0, 0, 29, 29, 0, 0, 0, 0, 8, 11, 0, 9, 9, 0, 0, 0, 4, 4, 0, 12, 12, 0, 0, 0, 9, 0, 0, 0, 0, 0, 8, 18, 0, 0, 0, 4, 4, 0, 0, 8, 9, 0, 4, 4, 0, 6, 11, 5, 0, 4, 4, 0, 13, 13, 0, 0, 0, 10, 0, 0, 25, 0, 0, 6, 0, 4, 4, 0, 0, 0, 0, 7, 0, 0, 23, 0, 0, 4, 4, 0, 0, 0, 6, 11, 0, 5, 4, 4, 18, 0, 0, 0, 0, 0, 0, 7, 15, 0, 0, 0, 15, 15, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

View File

@@ -527,4 +527,4 @@ class XLMModelLanguageGenerationTest(unittest.TestCase):
] # the president the president the president the president the president the president the president the president the president the president
# TODO(PVP): this and other input_ids I tried for generation give pretty bad results. Not sure why. Model might just not be made for auto-regressive inference
output_ids = model.generate(input_ids, do_sample=False)
self.assertListEqual(output_ids[0].cpu().numpy().tolist(), expected_output_ids)
self.assertListEqual(output_ids[0].tolist(), expected_output_ids)