Sync video classification pipeline with huggingface_hub spec (#34288)
* Sync video classification pipeline * Add disclaimer
This commit is contained in:
@@ -1,3 +1,17 @@
|
|||||||
|
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import warnings
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
|
|
||||||
@@ -42,7 +56,7 @@ class VideoClassificationPipeline(Pipeline):
|
|||||||
requires_backends(self, "av")
|
requires_backends(self, "av")
|
||||||
self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES)
|
self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES)
|
||||||
|
|
||||||
def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None):
|
def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None, function_to_apply=None):
|
||||||
preprocess_params = {}
|
preprocess_params = {}
|
||||||
if frame_sampling_rate is not None:
|
if frame_sampling_rate is not None:
|
||||||
preprocess_params["frame_sampling_rate"] = frame_sampling_rate
|
preprocess_params["frame_sampling_rate"] = frame_sampling_rate
|
||||||
@@ -52,14 +66,23 @@ class VideoClassificationPipeline(Pipeline):
|
|||||||
postprocess_params = {}
|
postprocess_params = {}
|
||||||
if top_k is not None:
|
if top_k is not None:
|
||||||
postprocess_params["top_k"] = top_k
|
postprocess_params["top_k"] = top_k
|
||||||
|
if function_to_apply is not None:
|
||||||
|
if function_to_apply not in ["softmax", "sigmoid", "none"]:
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid value for `function_to_apply`: {function_to_apply}. "
|
||||||
|
"Valid options are ['softmax', 'sigmoid', 'none']"
|
||||||
|
)
|
||||||
|
postprocess_params["function_to_apply"] = function_to_apply
|
||||||
|
else:
|
||||||
|
postprocess_params["function_to_apply"] = "softmax"
|
||||||
return preprocess_params, {}, postprocess_params
|
return preprocess_params, {}, postprocess_params
|
||||||
|
|
||||||
def __call__(self, videos: Union[str, List[str]], **kwargs):
|
def __call__(self, inputs: Union[str, List[str]] = None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Assign labels to the video(s) passed as inputs.
|
Assign labels to the video(s) passed as inputs.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
videos (`str`, `List[str]`):
|
inputs (`str`, `List[str]`):
|
||||||
The pipeline handles three types of videos:
|
The pipeline handles three types of videos:
|
||||||
|
|
||||||
- A string containing a http link pointing to a video
|
- A string containing a http link pointing to a video
|
||||||
@@ -76,6 +99,11 @@ class VideoClassificationPipeline(Pipeline):
|
|||||||
frame_sampling_rate (`int`, *optional*, defaults to 1):
|
frame_sampling_rate (`int`, *optional*, defaults to 1):
|
||||||
The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every
|
The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every
|
||||||
frame will be used.
|
frame will be used.
|
||||||
|
function_to_apply (`str`, *optional*, defaults to "softmax"):
|
||||||
|
The function to apply to the model output. By default, the pipeline will apply the softmax function to
|
||||||
|
the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's
|
||||||
|
built-in `None` will default to "softmax", so you need to pass the string "none" to disable any
|
||||||
|
post-processing.
|
||||||
|
|
||||||
Return:
|
Return:
|
||||||
A dictionary or a list of dictionaries containing result. If the input is a single video, will return a
|
A dictionary or a list of dictionaries containing result. If the input is a single video, will return a
|
||||||
@@ -87,7 +115,16 @@ class VideoClassificationPipeline(Pipeline):
|
|||||||
- **label** (`str`) -- The label identified by the model.
|
- **label** (`str`) -- The label identified by the model.
|
||||||
- **score** (`float`) -- The score attributed by the model for that label.
|
- **score** (`float`) -- The score attributed by the model for that label.
|
||||||
"""
|
"""
|
||||||
return super().__call__(videos, **kwargs)
|
# Once the deprecation of `videos` is complete, remove the default `None` value for `inputs`
|
||||||
|
if "videos" in kwargs:
|
||||||
|
warnings.warn(
|
||||||
|
"The `videos` argument has been renamed to `inputs`. In version 5 of Transformers, `videos` will no longer be accepted",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
inputs = kwargs.pop("videos")
|
||||||
|
if inputs is None:
|
||||||
|
raise ValueError("Cannot call the video-classification pipeline without an inputs argument!")
|
||||||
|
return super().__call__(inputs, **kwargs)
|
||||||
|
|
||||||
def preprocess(self, video, num_frames=None, frame_sampling_rate=1):
|
def preprocess(self, video, num_frames=None, frame_sampling_rate=1):
|
||||||
if num_frames is None:
|
if num_frames is None:
|
||||||
@@ -114,12 +151,17 @@ class VideoClassificationPipeline(Pipeline):
|
|||||||
model_outputs = self.model(**model_inputs)
|
model_outputs = self.model(**model_inputs)
|
||||||
return model_outputs
|
return model_outputs
|
||||||
|
|
||||||
def postprocess(self, model_outputs, top_k=5):
|
def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"):
|
||||||
if top_k > self.model.config.num_labels:
|
if top_k > self.model.config.num_labels:
|
||||||
top_k = self.model.config.num_labels
|
top_k = self.model.config.num_labels
|
||||||
|
|
||||||
if self.framework == "pt":
|
if self.framework == "pt":
|
||||||
probs = model_outputs.logits.softmax(-1)[0]
|
if function_to_apply == "softmax":
|
||||||
|
probs = model_outputs.logits[0].softmax(-1)
|
||||||
|
elif function_to_apply == "sigmoid":
|
||||||
|
probs = model_outputs.logits[0].sigmoid()
|
||||||
|
else:
|
||||||
|
probs = model_outputs.logits[0]
|
||||||
scores, ids = probs.topk(top_k)
|
scores, ids = probs.topk(top_k)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported framework: {self.framework}")
|
raise ValueError(f"Unsupported framework: {self.framework}")
|
||||||
|
|||||||
@@ -14,11 +14,12 @@
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from huggingface_hub import hf_hub_download
|
from huggingface_hub import VideoClassificationOutputElement, hf_hub_download
|
||||||
|
|
||||||
from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEFeatureExtractor
|
from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEFeatureExtractor
|
||||||
from transformers.pipelines import VideoClassificationPipeline, pipeline
|
from transformers.pipelines import VideoClassificationPipeline, pipeline
|
||||||
from transformers.testing_utils import (
|
from transformers.testing_utils import (
|
||||||
|
compare_pipeline_output_to_hub_spec,
|
||||||
is_pipeline_test,
|
is_pipeline_test,
|
||||||
nested_simplify,
|
nested_simplify,
|
||||||
require_av,
|
require_av,
|
||||||
@@ -76,6 +77,8 @@ class VideoClassificationPipelineTests(unittest.TestCase):
|
|||||||
{"score": ANY(float), "label": ANY(str)},
|
{"score": ANY(float), "label": ANY(str)},
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
for element in outputs:
|
||||||
|
compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
def test_small_model_pt(self):
|
def test_small_model_pt(self):
|
||||||
@@ -93,6 +96,9 @@ class VideoClassificationPipelineTests(unittest.TestCase):
|
|||||||
nested_simplify(outputs, decimals=4),
|
nested_simplify(outputs, decimals=4),
|
||||||
[{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}],
|
[{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}],
|
||||||
)
|
)
|
||||||
|
for output in outputs:
|
||||||
|
for element in output:
|
||||||
|
compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
|
||||||
|
|
||||||
outputs = video_classifier(
|
outputs = video_classifier(
|
||||||
[
|
[
|
||||||
@@ -108,6 +114,9 @@ class VideoClassificationPipelineTests(unittest.TestCase):
|
|||||||
[{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}],
|
[{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}],
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
for output in outputs:
|
||||||
|
for element in output:
|
||||||
|
compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
|
||||||
|
|
||||||
@require_tf
|
@require_tf
|
||||||
@unittest.skip
|
@unittest.skip
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ from huggingface_hub import (
|
|||||||
ImageToTextInput,
|
ImageToTextInput,
|
||||||
ObjectDetectionInput,
|
ObjectDetectionInput,
|
||||||
QuestionAnsweringInput,
|
QuestionAnsweringInput,
|
||||||
|
VideoClassificationInput,
|
||||||
ZeroShotImageClassificationInput,
|
ZeroShotImageClassificationInput,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -47,6 +48,7 @@ from transformers.pipelines import (
|
|||||||
ImageToTextPipeline,
|
ImageToTextPipeline,
|
||||||
ObjectDetectionPipeline,
|
ObjectDetectionPipeline,
|
||||||
QuestionAnsweringPipeline,
|
QuestionAnsweringPipeline,
|
||||||
|
VideoClassificationPipeline,
|
||||||
ZeroShotImageClassificationPipeline,
|
ZeroShotImageClassificationPipeline,
|
||||||
)
|
)
|
||||||
from transformers.testing_utils import (
|
from transformers.testing_utils import (
|
||||||
@@ -132,6 +134,7 @@ task_to_pipeline_and_spec_mapping = {
|
|||||||
"image-to-text": (ImageToTextPipeline, ImageToTextInput),
|
"image-to-text": (ImageToTextPipeline, ImageToTextInput),
|
||||||
"object-detection": (ObjectDetectionPipeline, ObjectDetectionInput),
|
"object-detection": (ObjectDetectionPipeline, ObjectDetectionInput),
|
||||||
"question-answering": (QuestionAnsweringPipeline, QuestionAnsweringInput),
|
"question-answering": (QuestionAnsweringPipeline, QuestionAnsweringInput),
|
||||||
|
"video-classification": (VideoClassificationPipeline, VideoClassificationInput),
|
||||||
"zero-shot-image-classification": (ZeroShotImageClassificationPipeline, ZeroShotImageClassificationInput),
|
"zero-shot-image-classification": (ZeroShotImageClassificationPipeline, ZeroShotImageClassificationInput),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user