[CLAP] Support batched inputs for CLAP. Fixes pipeline issues (#21931)

* fix pipeline

* fix feature_extraction clap

* you can now batch the `is_longer` attribute

* add tests

* fixup

* add expected scores

* comment on is_longert
This commit is contained in:
Arthur
2023-03-03 18:42:18 +01:00
committed by GitHub
parent c5fe06c59d
commit 718e9d777f
3 changed files with 59 additions and 2 deletions

View File

@@ -665,3 +665,55 @@ class ClapModelIntegrationTest(unittest.TestCase):
self.assertTrue(
torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3)
)
def test_batched_fused(self):
EXPECTED_MEANS_FUSED = {
"repeatpad": 0.0010,
"repeat": 0.0020,
"pad": 0.0006,
}
librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]
model_id = "laion/clap-htsat-fused"
model = ClapModel.from_pretrained(model_id).to(torch_device)
processor = ClapProcessor.from_pretrained(model_id)
for padding in self.paddings:
inputs = processor(audios=audio_samples, return_tensors="pt", padding=padding, truncation="fusion").to(
torch_device
)
audio_embed = model.get_audio_features(**inputs)
expected_mean = EXPECTED_MEANS_FUSED[padding]
self.assertTrue(
torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3)
)
def test_batched_unfused(self):
EXPECTED_MEANS_FUSED = {
"repeatpad": 0.0016,
"repeat": 0.0019,
"pad": 0.0019,
}
librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]]
model_id = "laion/clap-htsat-unfused"
model = ClapModel.from_pretrained(model_id).to(torch_device)
processor = ClapProcessor.from_pretrained(model_id)
for padding in self.paddings:
inputs = processor(audios=audio_samples, return_tensors="pt", padding=padding).to(torch_device)
audio_embed = model.get_audio_features(**inputs)
expected_mean = EXPECTED_MEANS_FUSED[padding]
self.assertTrue(
torch.allclose(audio_embed.cpu().mean(), torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3)
)