Fix more inefficient PT operations (#37060)
* Fix inefficient operations
* Remove cpu() call
* Reorder detach()
* Reorder detach()
* tolist without detach
* item without detach
* Update src/transformers/models/rag/modeling_rag.py

  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update tests/models/encodec/test_modeling_encodec.py

  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Use detach().cpu().numpy
* Revert some numpy operations
* More fixes

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
@@ -363,7 +363,7 @@ class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
|
||||
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_non_padded = model.generate(input_ids=inputs_non_padded)
|
||||
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
|
||||
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
|
||||
|
||||
|
||||
@@ -406,7 +406,7 @@ class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
|
||||
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_non_padded = model.generate(input_ids=inputs_non_padded)
|
||||
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
|
||||
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
|
||||
|
||||
|
||||
@@ -352,7 +352,7 @@ class Data2VecVisionModelIntegrationTest(unittest.TestCase):
|
||||
torch.testing.assert_close(logits[0, :3], expected_slice, rtol=1e-4, atol=1e-4)
|
||||
|
||||
expected_top2 = [model.config.label2id[i] for i in ["remote control, remote", "tabby, tabby cat"]]
|
||||
self.assertEqual(logits[0].topk(2).indices.cpu().tolist(), expected_top2)
|
||||
self.assertEqual(logits[0].topk(2).indices.tolist(), expected_top2)
|
||||
|
||||
@slow
|
||||
def test_inference_interpolate_pos_encoding(self):
|
||||
|
||||
@@ -117,7 +117,7 @@ class EncodecModelTester:
|
||||
config.normalize = True
|
||||
|
||||
processor = EncodecFeatureExtractor(feature_size=config.audio_channels, sampling_rate=config.sampling_rate)
|
||||
input_values = list(input_values.cpu().numpy())
|
||||
input_values = input_values.tolist()
|
||||
inputs_dict = processor(
|
||||
input_values, sampling_rate=config.sampling_rate, padding=True, return_tensors="pt"
|
||||
).to(torch_device)
|
||||
@@ -495,7 +495,7 @@ class EncodecIntegrationTest(unittest.TestCase):
|
||||
# use max bandwidth for best possible reconstruction
|
||||
encoder_outputs = model.encode(inputs["input_values"], bandwidth=float(bandwidth))
|
||||
|
||||
audio_code_sums = [a[0].sum().cpu().item() for a in encoder_outputs[0]]
|
||||
audio_code_sums = [a[0].sum().item() for a in encoder_outputs[0]]
|
||||
|
||||
# make sure audio encoded codes are correct
|
||||
self.assertListEqual(audio_code_sums, expected_codesums[bandwidth])
|
||||
@@ -552,7 +552,7 @@ class EncodecIntegrationTest(unittest.TestCase):
|
||||
encoder_outputs = model.encode(
|
||||
inputs["input_values"], inputs["padding_mask"], bandwidth=float(bandwidth), return_dict=False
|
||||
)
|
||||
audio_code_sums = [a[0].sum().cpu().item() for a in encoder_outputs[0]]
|
||||
audio_code_sums = [a[0].sum().item() for a in encoder_outputs[0]]
|
||||
|
||||
# make sure audio encoded codes are correct
|
||||
self.assertListEqual(audio_code_sums, expected_codesums[bandwidth])
|
||||
@@ -610,8 +610,8 @@ class EncodecIntegrationTest(unittest.TestCase):
|
||||
with torch.no_grad():
|
||||
# use max bandwidth for best possible reconstruction
|
||||
encoder_outputs = model.encode(input_values, bandwidth=float(bandwidth), return_dict=False)
|
||||
audio_code_sums_0 = [a[0][0].sum().cpu().item() for a in encoder_outputs[0]]
|
||||
audio_code_sums_1 = [a[0][1].sum().cpu().item() for a in encoder_outputs[0]]
|
||||
audio_code_sums_0 = [a[0][0].sum().item() for a in encoder_outputs[0]]
|
||||
audio_code_sums_1 = [a[0][1].sum().item() for a in encoder_outputs[0]]
|
||||
|
||||
# make sure audio encoded codes are correct
|
||||
self.assertListEqual(audio_code_sums_0, expected_codesums[bandwidth][0])
|
||||
|
||||
@@ -662,7 +662,7 @@ class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
||||
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_non_padded = model.generate(input_ids=inputs_non_padded, max_length=20)
|
||||
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
|
||||
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
|
||||
|
||||
@@ -724,7 +724,7 @@ class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
||||
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_non_padded = model.generate(input_ids=inputs_non_padded, max_length=20)
|
||||
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
|
||||
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
|
||||
|
||||
|
||||
@@ -552,7 +552,7 @@ class GPTNeoModelLanguageGenerationTest(unittest.TestCase):
|
||||
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_non_padded = model.generate(input_ids=inputs_non_padded)
|
||||
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
|
||||
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
|
||||
|
||||
|
||||
@@ -466,7 +466,7 @@ class GPTJModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
|
||||
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_non_padded = model.generate(input_ids=inputs_non_padded)
|
||||
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
|
||||
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
|
||||
|
||||
|
||||
@@ -540,7 +540,7 @@ class MimiIntegrationTest(unittest.TestCase):
|
||||
# use max bandwidth for best possible reconstruction
|
||||
encoder_outputs = model.encode(inputs["input_values"], num_quantizers=int(num_codebooks))
|
||||
|
||||
audio_code_sums = encoder_outputs[0].sum().cpu().item()
|
||||
audio_code_sums = encoder_outputs[0].sum().item()
|
||||
|
||||
# make sure audio encoded codes are correct
|
||||
# assert relative difference less than a threshold, because `audio_code_sums` varies a bit
|
||||
|
||||
@@ -951,8 +951,8 @@ class MoshiIntegrationTests(unittest.TestCase):
|
||||
expected_text_token = 452
|
||||
expected_audio_tokens = [916, 1396, 1238, 579, 1105, 914, 1257, 810] # fmt: skip
|
||||
|
||||
self.assertTrue(expected_text_token == model_outputs.sequences[0, -2].cpu().item())
|
||||
self.assertTrue(expected_audio_tokens == model_outputs.audio_codes[0, :, -1].cpu().tolist())
|
||||
self.assertTrue(expected_text_token == model_outputs.sequences[0, -2].item())
|
||||
self.assertTrue(expected_audio_tokens == model_outputs.audio_codes[0, :, -1].tolist())
|
||||
|
||||
@slow
|
||||
def test_moshiko_greedy_unconditional_fp16_eager(self):
|
||||
@@ -966,7 +966,7 @@ class MoshiIntegrationTests(unittest.TestCase):
|
||||
)
|
||||
|
||||
# eager equivalence is not as strict as sdpa.
|
||||
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist())
|
||||
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].tolist())
|
||||
|
||||
@slow
|
||||
def test_moshiko_greedy_unconditional_fp32(self):
|
||||
@@ -986,8 +986,8 @@ class MoshiIntegrationTests(unittest.TestCase):
|
||||
audio_code_sums = model_outputs.audio_codes.sum().item()
|
||||
self.assertTrue(np.abs(audio_code_sums - expected_audio_codesum) <= (3e-3 * audio_code_sums))
|
||||
|
||||
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].cpu().tolist())
|
||||
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist())
|
||||
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].tolist())
|
||||
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].tolist())
|
||||
|
||||
@slow
|
||||
@require_torch_fp16
|
||||
@@ -1008,8 +1008,8 @@ class MoshiIntegrationTests(unittest.TestCase):
|
||||
audio_code_sums = model_outputs.audio_codes.sum().item()
|
||||
self.assertTrue(np.abs(audio_code_sums - expected_audio_codesum) <= (3e-3 * audio_code_sums))
|
||||
|
||||
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].cpu().tolist())
|
||||
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist())
|
||||
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].tolist())
|
||||
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].tolist())
|
||||
|
||||
@slow
|
||||
@require_torch_fp16
|
||||
@@ -1030,5 +1030,5 @@ class MoshiIntegrationTests(unittest.TestCase):
|
||||
audio_code_sums = model_outputs.audio_codes.sum().item()
|
||||
self.assertTrue(np.abs(audio_code_sums - expected_audio_codesum) <= 2048)
|
||||
|
||||
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].cpu().tolist())
|
||||
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].cpu().tolist())
|
||||
self.assertTrue(expected_text_tokens == model_outputs.sequences[0, 1:].tolist())
|
||||
self.assertTrue(some_expected_audio_tokens == model_outputs.audio_codes[0, :, :2].tolist())
|
||||
|
||||
@@ -486,7 +486,7 @@ class OPTGenerationTest(unittest.TestCase):
|
||||
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_non_padded = model.generate(input_ids=inputs_non_padded)
|
||||
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
|
||||
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
|
||||
|
||||
|
||||
@@ -989,7 +989,7 @@ class Owlv2ModelIntegrationTest(unittest.TestCase):
|
||||
outputs, text_labels=text_labels
|
||||
)
|
||||
|
||||
objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
|
||||
objects_labels = post_processed_output_with_text_labels[0]["labels"].tolist()
|
||||
self.assertListEqual(objects_labels, [0, 0])
|
||||
|
||||
objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]
|
||||
|
||||
@@ -975,7 +975,7 @@ class OwlViTModelIntegrationTest(unittest.TestCase):
|
||||
outputs, text_labels=text_labels
|
||||
)
|
||||
|
||||
objects_labels = post_processed_output_with_text_labels[0]["labels"].cpu().tolist()
|
||||
objects_labels = post_processed_output_with_text_labels[0]["labels"].tolist()
|
||||
self.assertListEqual(objects_labels, [0, 0])
|
||||
|
||||
objects_text_labels = post_processed_output_with_text_labels[0]["text_labels"]
|
||||
|
||||
@@ -311,7 +311,7 @@ class RagTestMixin:
|
||||
|
||||
out = retriever(
|
||||
input_ids,
|
||||
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
|
||||
question_hidden_states.detach().to(device="cpu", dtype=torch.float32).numpy(),
|
||||
prefix=config.generator.prefix,
|
||||
return_tensors="pt",
|
||||
)
|
||||
@@ -379,7 +379,7 @@ class RagTestMixin:
|
||||
|
||||
out = retriever(
|
||||
input_ids,
|
||||
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
|
||||
question_hidden_states.detach().to(device="cpu", dtype=torch.float32).numpy(),
|
||||
prefix=config.generator.prefix,
|
||||
return_tensors="pt",
|
||||
)
|
||||
@@ -438,7 +438,7 @@ class RagTestMixin:
|
||||
|
||||
out = retriever(
|
||||
input_ids,
|
||||
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
|
||||
question_hidden_states.detach().to(device="cpu", dtype=torch.float32).numpy(),
|
||||
prefix=config.generator.prefix,
|
||||
return_tensors="pt",
|
||||
n_docs=n_docs,
|
||||
@@ -507,7 +507,7 @@ class RagTestMixin:
|
||||
|
||||
out = retriever(
|
||||
input_ids,
|
||||
question_hidden_states.cpu().detach().to(torch.float32).numpy(),
|
||||
question_hidden_states.detach().to(device="cpu", dtype=torch.float32).numpy(),
|
||||
prefix=config.generator.prefix,
|
||||
return_tensors="pt",
|
||||
n_docs=retriever_n_docs,
|
||||
@@ -964,7 +964,7 @@ class RagModelIntegrationTests(unittest.TestCase):
|
||||
|
||||
question_hidden_states = rag_sequence.question_encoder(input_ids, attention_mask=attention_mask)[0]
|
||||
docs_dict = retriever(
|
||||
input_ids.cpu().detach().numpy(), question_hidden_states.cpu().detach().numpy(), return_tensors="pt"
|
||||
input_ids.detach().cpu().numpy(), question_hidden_states.detach().cpu().numpy(), return_tensors="pt"
|
||||
)
|
||||
doc_scores = torch.bmm(
|
||||
question_hidden_states.unsqueeze(1),
|
||||
|
||||
@@ -1044,7 +1044,7 @@ class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
).loss
|
||||
|
||||
# loss_more_masked has to be bigger or equal loss since more masked inputs have to be predicted
|
||||
self.assertTrue(loss.detach().item() <= loss_more_masked.detach().item())
|
||||
self.assertTrue(loss.item() <= loss_more_masked.item())
|
||||
|
||||
def test_mask_feature_prob_ctc(self):
|
||||
model = Wav2Vec2ForCTC.from_pretrained(
|
||||
|
||||
@@ -670,7 +670,7 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
#
|
||||
# input_values = feature_extractor(sample["audio"]["array"], return_tensors="pt").input_values
|
||||
# logits = model(input_values).logits
|
||||
# pred_ids = torch.argmax(logits, axis=-1).cpu().tolist()
|
||||
# pred_ids = torch.argmax(logits, axis=-1).tolist()
|
||||
# ```
|
||||
# fmt: off
|
||||
pred_ids = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 11, 0, 0, 0, 22, 0, 0, 4, 4, 4, 14, 0, 0, 0, 0, 0, 8, 8, 0, 5, 5, 0, 12, 0, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 10, 0, 0, 0, 15, 0, 0, 10, 0, 0, 0, 12, 0, 0, 0, 0, 0, 7, 0, 9, 0, 0, 14, 0, 0, 0, 13, 0, 7, 0, 0, 4, 4, 0, 15, 8, 8, 0, 0, 8, 0, 26, 0, 0, 4, 4, 0, 0, 15, 0, 0, 0, 0, 0, 0, 10, 0, 26, 5, 5, 0, 4, 4, 0, 0, 12, 11, 0, 0, 5, 4, 4, 4, 0, 18, 0, 0, 0, 7, 9, 9, 0, 6, 0, 12, 12, 4, 4, 0, 6, 0, 0, 8, 0, 4, 4, 4, 0, 19, 0, 0, 8, 9, 9, 0, 0, 0, 0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 16, 16, 0, 0, 17, 5, 5, 5, 0, 4, 4, 4, 0, 0, 29, 29, 0, 0, 0, 0, 8, 11, 0, 9, 9, 0, 0, 0, 4, 4, 0, 12, 12, 0, 0, 0, 9, 0, 0, 0, 0, 0, 8, 18, 0, 0, 0, 4, 4, 0, 0, 8, 9, 0, 4, 4, 0, 6, 11, 5, 0, 4, 4, 0, 13, 13, 0, 0, 0, 10, 0, 0, 25, 0, 0, 6, 0, 4, 4, 0, 0, 0, 0, 7, 0, 0, 23, 0, 0, 4, 4, 0, 0, 0, 6, 11, 0, 5, 4, 4, 18, 0, 0, 0, 0, 0, 0, 7, 15, 0, 0, 0, 15, 15, 0, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
|
||||
|
||||
@@ -527,4 +527,4 @@ class XLMModelLanguageGenerationTest(unittest.TestCase):
|
||||
] # the president the president the president the president the president the president the president the president the president the president
|
||||
# TODO(PVP): this and other input_ids I tried for generation give pretty bad results. Not sure why. Model might just not be made for auto-regressive inference
|
||||
output_ids = model.generate(input_ids, do_sample=False)
|
||||
self.assertListEqual(output_ids[0].cpu().numpy().tolist(), expected_output_ids)
|
||||
self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
|
||||
|
||||
Reference in New Issue
Block a user