[chat templates] support loading audio from video (#36955)
* add audio from video * typos * delete print * comments
This commit is contained in:
committed by
GitHub
parent
c7bc79bd2a
commit
e97c760006
@@ -1097,10 +1097,7 @@ class ProcessorTesterMixin:
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "video",
|
||||
"path": video_file_path,
|
||||
},
|
||||
{"type": "video", "path": video_file_path},
|
||||
{"type": "text", "text": "What is shown in this video?"},
|
||||
],
|
||||
},
|
||||
@@ -1189,6 +1186,70 @@ class ProcessorTesterMixin:
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 1)
|
||||
self.assertEqual(len(out_dict_with_video[self.videos_input_name][0]), 243)
|
||||
|
||||
@require_librosa
@require_av
def test_audio_chat_template_from_video(self):
    """Check that `apply_chat_template` can load a video's audio track as an audio input.

    The conversation holds one video (first user turn) and one standalone audio
    clip (last user turn). With `load_audio_from_video=True` the processed output
    is expected to contain two audio entries and one video entry.

    The test is skipped when the processor has no chat template, when its
    `__call__` has no annotated `videos` parameter, or when the processor class
    does not list a `feature_extractor` attribute.
    """
    processor = self.get_processor()
    if processor.chat_template is None:
        self.skipTest("Processor has no chat template")

    # A missing or un-annotated `videos` parameter is treated as "no video support".
    videos_param = inspect.signature(processor.__call__).parameters.get("videos")
    if videos_param is None or videos_param.annotation is inspect.Parameter.empty:
        self.skipTest(f"{self.processor_class} does not support video inputs")

    if "feature_extractor" not in self.processor_class.attributes:
        self.skipTest(f"feature_extractor attribute not present in {self.processor_class}")

    video_file_path = hf_hub_download(
        repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset"
    )
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "path": video_file_path},
                {"type": "text", "text": "Which of these animals is making the sound?"},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "It is a cow."}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "audio",
                    "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
                },
                {"type": "text", "text": "Is it the same sound?"},
            ],
        },
    ]

    # Text-only rendering of a batch that contains a single conversation.
    formatted_prompt = processor.apply_chat_template([messages], add_generation_prompt=True, tokenize=False)
    self.assertEqual(len(formatted_prompt), 1)  # batch size=1

    out_dict = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="np",
        load_audio_from_video=True,
    )
    # NOTE(review): uses `videos_input_name` to match the other video tests in this
    # mixin; the previous `video_input_name` spelling is not used elsewhere in view.
    self.assertIn(self.audio_input_name, out_dict)
    self.assertIn(self.videos_input_name, out_dict)

    # should always have input_ids and attention_mask
    self.assertEqual(len(out_dict["input_ids"]), 1)  # batch-size=1
    self.assertEqual(len(out_dict["attention_mask"]), 1)  # batch-size=1
    self.assertEqual(len(out_dict[self.audio_input_name]), 2)  # 2 audios in the conversation
    self.assertEqual(len(out_dict[self.videos_input_name]), 1)  # 1 video in the conversation
|
||||
|
||||
@require_librosa
|
||||
def test_audio_chat_template_single(self):
|
||||
processor = self.get_processor()
|
||||
|
||||
Reference in New Issue
Block a user