Add torchcodec in docstrings/tests for datasets 4.0 (#39156)

* fix dataset run_object_detection * bump version * keep same dataset actually * torchcodec in docstrings and testing utils * torchcodec in dockerfiles and requirements * remove duplicate * add torchocodec to all the remaining docker files * fix tests * support torchcodec in audio classification and ASR * [commit to revert] build ci-dev images * [commit to revert] trigger circleci * [commit to revert] build ci-dev images * fix * fix modeling_hubert * backward compatible run_object_detection * revert ci trigger commits * fix mono conversion and support torch tensor as input * revert map_to_array docs + fix it * revert mono * nit in docstring * style * fix modular --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-07-08 17:06:12 +02:00
parent 1255480fd2
commit 1ecd52e50a
78 changed files with 448 additions and 350 deletions
--- a/examples/pytorch/_tests_requirements.txt
+++ b/examples/pytorch/_tests_requirements.txt
@@ -22,6 +22,7 @@ protobuf
 torch
 torchvision
 torchaudio
+torchcodec
 jiwer
 librosa
 evaluate >= 0.2.0
--- a/examples/pytorch/object-detection/requirements.txt
+++ b/examples/pytorch/object-detection/requirements.txt
@@ -1,5 +1,5 @@
 albumentations >= 1.4.16
 timm
-datasets
+datasets>=4.0
 torchmetrics
 pycocotools
--- a/examples/pytorch/object-detection/run_object_detection.py
+++ b/examples/pytorch/object-detection/run_object_detection.py
@@ -399,7 +399,10 @@ def main():
        dataset["validation"] = split["test"]

    # Get dataset categories and prepare mappings for label_name <-> label_id
-    categories = dataset["train"].features["objects"].feature["category"].names
+    if isinstance(dataset["train"].features["objects"], dict):
+        categories = dataset["train"].features["objects"]["category"].feature.names
+    else:  # (for old versions of `datasets` that used Sequence({...}) of the objects)
+        categories = dataset["train"].features["objects"].feature["category"].names
    id2label = dict(enumerate(categories))
    label2id = {v: k for k, v in id2label.items()}

--- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py
+++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
@@ -460,7 +460,10 @@ def main():
        dataset["validation"] = split["test"]

    # Get dataset categories and prepare mappings for label_name <-> label_id
-    categories = dataset["train"].features["objects"].feature["category"].names
+    if isinstance(dataset["train"].features["objects"], dict):
+        categories = dataset["train"].features["objects"]["category"].feature.names
+    else:  # (for old versions of `datasets` that used Sequence({...}) of the objects)
+        categories = dataset["train"].features["objects"].feature["category"].names
    id2label = dict(enumerate(categories))
    label2id = {v: k for k, v in id2label.items()}