Add torchcodec in docstrings/tests for datasets 4.0 (#39156)

* fix dataset run_object_detection

* bump version

* keep same dataset actually

* torchcodec in docstrings and testing utils

* torchcodec in dockerfiles and requirements

* remove duplicate

* add torchocodec to all the remaining docker files

* fix tests

* support torchcodec in audio classification and ASR

* [commit to revert] build ci-dev images

* [commit to revert] trigger circleci

* [commit to revert] build ci-dev images

* fix

* fix modeling_hubert

* backward compatible run_object_detection

* revert ci trigger commits

* fix mono conversion and support torch tensor as input

* revert map_to_array docs + fix it

* revert mono

* nit in docstring

* style

* fix modular

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Quentin Lhoest
2025-07-08 17:06:12 +02:00
committed by GitHub
parent 1255480fd2
commit 1ecd52e50a
78 changed files with 448 additions and 350 deletions

View File

@@ -145,7 +145,7 @@ class DiaFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# automatic decoding with librispeech
audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
audio_samples = ds.sort("id")[:num_samples]["audio"]
return [x["array"] for x in audio_samples]

View File

@@ -665,8 +665,12 @@ class DiaForConditionalGenerationIntegrationTest(unittest.TestCase):
@require_torch_accelerator
def test_dia_model_integration_generate_audio_context(self):
text = ["[S1] Dia is an open weights text to dialogue model.", "This is a test"]
audio_sample_1 = torchaudio.load(self.audio_prompt_1_path, channels_first=True)[0].squeeze().numpy()
audio_sample_2 = torchaudio.load(self.audio_prompt_2_path, channels_first=True)[0].squeeze().numpy()
audio_sample_1 = (
torchaudio.load(self.audio_prompt_1_path, channels_first=True, backend="soundfile")[0].squeeze().numpy()
)
audio_sample_2 = (
torchaudio.load(self.audio_prompt_2_path, channels_first=True, backend="soundfile")[0].squeeze().numpy()
)
audio = [audio_sample_1, audio_sample_2]
processor = DiaProcessor.from_pretrained(self.model_checkpoint)