Add torchcodec in docstrings/tests for datasets 4.0 (#39156)

* fix dataset run_object_detection

* bump version

* keep same dataset actually

* torchcodec in docstrings and testing utils

* torchcodec in dockerfiles and requirements

* remove duplicate

* add torchcodec to all the remaining docker files

* fix tests

* support torchcodec in audio classification and ASR

* [commit to revert] build ci-dev images

* [commit to revert] trigger circleci

* [commit to revert] build ci-dev images

* fix

* fix modeling_hubert

* backward compatible run_object_detection

* revert ci trigger commits

* fix mono conversion and support torch tensor as input

* revert map_to_array docs + fix it

* revert mono

* nit in docstring

* style

* fix modular

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Quentin Lhoest
2025-07-08 17:06:12 +02:00
committed by GitHub
parent 1255480fd2
commit 1ecd52e50a
78 changed files with 448 additions and 350 deletions

View File

@@ -207,7 +207,7 @@ class Phi4MultimodalFeatureExtractionTest(SequenceFeatureExtractionTestMixin, un
def _load_datasamples(self, num_samples):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
speech_samples = ds.sort("id")[:num_samples]["audio"]
return [x["array"] for x in speech_samples]

View File

@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
import requests
@@ -33,13 +32,13 @@ from transformers import (
from transformers.testing_utils import (
Expectations,
cleanup,
require_soundfile,
require_torch,
require_torch_large_accelerator,
require_torchcodec,
slow,
torch_device,
)
from transformers.utils import is_soundfile_available
from transformers.utils import is_torchcodec_available
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
@@ -54,8 +53,8 @@ if is_vision_available():
from PIL import Image
if is_soundfile_available():
import soundfile
if is_torchcodec_available():
import torchcodec
class Phi4MultimodalModelTester:
@@ -296,11 +295,9 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
self.assistant_token = "<|assistant|>"
self.end_token = "<|end|>"
self.image = Image.open(requests.get(self.image_url, stream=True).raw)
with tempfile.NamedTemporaryFile(mode="w+b", suffix=".wav") as tmp:
tmp.write(requests.get(self.audio_url, stream=True).raw.data)
tmp.flush()
tmp.seek(0)
self.audio, self.sampling_rate = soundfile.read(tmp.name)
audio_bytes = requests.get(self.audio_url, stream=True).raw.data
samples = torchcodec.decoders.AudioDecoder(audio_bytes).get_all_samples()
self.audio, self.sampling_rate = samples.data, samples.sample_rate
cleanup(torch_device, gc_collect=True)
@@ -378,7 +375,7 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase):
self.assertEqual(response, EXPECTED_RESPONSE)
@require_soundfile
@require_torchcodec
def test_audio_text_generation(self):
model = AutoModelForCausalLM.from_pretrained(
self.checkpoint_path, revision=self.revision, torch_dtype=torch.float16, device_map=torch_device