Add TVLT (#20725)
* Update image_processing_tvlt.py * Update modeling_tvlt.py * Update * Update modeling_tvlt.py * Create tvlt.mdx * Update configuration_tvlt.py * Update modeling_tvlt.py * Update test_modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update image_processing_tvlt.py * Update feature_extraction_tvlt.py * Update tvlt models * Update tests * Update * Update * Update tests * Update README_ko.md * Update README_ja.md * Update README_ko.md * Update README_zh-hans.md * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update tvlt.mdx * Update modeling_tvlt.py * Update configuration_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Add files via upload * Update model * Update modeling_tvlt.py * Update tvlt models * Update src/transformers/models/tvlt/__init__.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/__init__.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: 
amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Add files via upload * Add files via upload * Delete modeling_tvlt.py * Delete feature_extraction_tvlt.py * Delete configuration_tvlt.py * Delete image_processing_tvlt.py * Delete processing_tvlt.py * Update tvlt * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update README.md Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update README_es.md * Update README_hd.md * Update README_ja.md * Update README_ko.md * Update README_zh-hans.md * Update README_zh-hant.md * Update index.mdx * Update tvlt.mdx * Update tvlt.mdx * Update configuration_tvlt.py * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: 
Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update modeling_tvlt.py * Add files via upload * Update tvlt.mdx * Update modeling_auto.py * Add files via upload * Add files via upload * Update dummy_pt_objects.py * Update __init__.py * Update feature_extraction_tvlt.py * Update feature_extraction_tvlt.py * Update image_processing_tvlt.py * Update modeling_auto.py * Update test_feature_extraction_tvlt.py * Update test_processor_tvlt.py * Update test_feature_extraction_tvlt.py * Add files via upload * Update test_image_processor_tvlt.py * Update tests/models/tvlt/test_processor_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_image_processor_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update tests/models/tvlt/test_image_processor_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_image_processor_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update 
tests/models/tvlt/test_image_processor_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update 
src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update feature_extraction_tvlt.py * Update feature_extraction_tvlt.py * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update image_processing_tvlt.py * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update test_image_processor_tvlt.py * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update 
tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/tvlt/test_modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Add files via upload * Add files via upload * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Add files via upload * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update image_processing_tvlt.py * Add files via upload * Add files via upload * Update tvlt.mdx * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * Update docs/source/en/model_doc/tvlt.mdx Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * Update 
src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * Add files via upload * Add files via upload * Add files via upload * Add files via upload * Update modeling_auto.py * Update tvlt.mdx * Update dummy_pt_objects.py * Update feature_extraction_tvlt.py * Update modeling_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_image_processor_tvlt.py * Update test_feature_extraction_tvlt.py * Update modeling_tvlt.py * Update dummy_pt_objects.py * Update dummy_speech_objects.py * Add files via upload * Update README_hd.md * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling_tvlt.py * Update test_modeling_tvlt.py * Update src/transformers/models/tvlt/configuration_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/feature_extraction_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/image_processing_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update MAE processing * Update modeling_tvlt.py * Update modeling_tvlt.py * Update modeling * Update style * Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * 
Update src/transformers/models/tvlt/modeling_tvlt.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update check_repo.py * Update tvlt.mdx * Update __init__.py * Update tests * Update tvlt models * Update configuration_tvlt.py * Update configuration_tvlt.py * Update image_processing_tvlt.py * Update dummy_pt_objects.py * Add files via upload * Update test_modeling_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_feature_extraction_tvlt.py * Update test_feature_extraction_tvlt.py --------- Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
This commit is contained in:
0
tests/models/tvlt/__init__.py
Normal file
0
tests/models/tvlt/__init__.py
Normal file
207
tests/models/tvlt/test_feature_extraction_tvlt.py
Normal file
207
tests/models/tvlt/test_feature_extraction_tvlt.py
Normal file
@@ -0,0 +1,207 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the TVLT feature extraction. """
|
||||
|
||||
import itertools
|
||||
import os
|
||||
import random
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers import is_datasets_available, is_speech_available
|
||||
from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio
|
||||
from transformers.utils.import_utils import is_torch_available
|
||||
|
||||
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_datasets_available():
|
||||
from datasets import load_dataset
|
||||
|
||||
if is_speech_available():
|
||||
from transformers import TvltFeatureExtractor
|
||||
|
||||
# Shared, unseeded module-level generator; `floats_list` falls back to it when no `rng` is passed.
global_rng = random.Random()
|
||||
|
||||
|
||||
def floats_list(shape, scale=1.0, rng=None, name=None):
    """Create a nested list of random floats.

    Args:
        shape: Two-element sequence `(rows, cols)`; only `shape[0]` and `shape[1]` are read.
        scale: Factor every sample is multiplied by, so values lie in `[0, scale)` for a positive scale.
        rng: Object exposing a `random()` method (e.g. `random.Random`). Falls back to the
            module-level `global_rng` when `None`.
        name: Unused; kept so existing callers that pass it keep working.

    Returns:
        A list of `shape[0]` lists, each holding `shape[1]` Python floats.
    """
    if rng is None:
        rng = global_rng

    # Samples are drawn row by row, so the result is reproducible for a seeded `rng`.
    return [[rng.random() * scale for _ in range(shape[1])] for _ in range(shape[0])]
|
||||
|
||||
|
||||
class TvltFeatureExtractionTester(unittest.TestCase):
    """Holds the settings used by the TVLT feature extraction tests and builds their inputs."""

    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        spectrogram_length=2048,
        feature_size=128,
        num_audio_channels=1,
        hop_length=512,
        chunk_length=30,
        sampling_rate=44100,
    ):
        # NOTE: `unittest.TestCase.__init__` is deliberately not called (as in the original);
        # this object is only used as a container of settings by the real test case.
        self.parent = parent
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        # Step between consecutive input lengths. Guard the divisor so that `batch_size=1`
        # no longer raises ZeroDivisionError; for larger batches the value is unchanged.
        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // max(self.batch_size - 1, 1)
        self.spectrogram_length = spectrogram_length
        self.feature_size = feature_size
        self.num_audio_channels = num_audio_channels
        self.hop_length = hop_length
        self.chunk_length = chunk_length
        self.sampling_rate = sampling_rate

    def prepare_feat_extract_dict(self):
        """Return the stored settings as a dict of feature extractor keyword arguments."""
        return {
            "spectrogram_length": self.spectrogram_length,
            "feature_size": self.feature_size,
            "num_audio_channels": self.num_audio_channels,
            "hop_length": self.hop_length,
            "chunk_length": self.chunk_length,
            "sampling_rate": self.sampling_rate,
        }

    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        """Build a batch of random inputs via `floats_list`.

        Args:
            equal_length: When true, create `batch_size` inputs that all have `max_seq_length` rows.
                Otherwise create one input per length in
                `range(min_seq_length, max_seq_length, seq_length_diff)`.
            numpify: When true, convert every input with `np.asarray`.

        Returns:
            A list of inputs, each with `feature_size` values per row.
        """
        if equal_length:
            speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
        else:
            # make sure that inputs increase in size
            speech_inputs = [
                floats_list((x, self.feature_size))
                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            ]
        if numpify:
            speech_inputs = [np.asarray(x) for x in speech_inputs]
        return speech_inputs
|
||||
|
||||
|
||||
@require_torch
@require_torchaudio
class TvltFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Tests `TvltFeatureExtractor`: exposed attributes, save/load round trips and `__call__` output shapes."""

    # NOTE(review): `self.feat_extract_dict` is not defined in this class; presumably it is
    # provided by `SequenceFeatureExtractionTestMixin` from `self.feat_extract_tester` -- confirm.
    feature_extraction_class = TvltFeatureExtractor if is_speech_available() else None

    def setUp(self):
        self.feat_extract_tester = TvltFeatureExtractionTester(self)

    def test_feat_extract_properties(self):
        """The feature extractor exposes every setting it was constructed with."""
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
        for attribute in (
            "spectrogram_length",
            "feature_size",
            "num_audio_channels",
            "hop_length",
            "chunk_length",
            "sampling_rate",
        ):
            self.assertTrue(hasattr(feature_extractor, attribute), f"missing attribute: {attribute}")

    def _assert_same_config(self, feat_extract_first, feat_extract_second):
        """Compare two feature extractors through `to_dict()`.

        `mel_filters` is popped and compared with `np.allclose`; the rest must be equal.
        """
        dict_first = feat_extract_first.to_dict()
        dict_second = feat_extract_second.to_dict()
        mel_1 = dict_first.pop("mel_filters")
        mel_2 = dict_second.pop("mel_filters")
        self.assertTrue(np.allclose(mel_1, mel_2))
        self.assertEqual(dict_first, dict_second)

    def test_feat_extract_from_and_save_pretrained(self):
        """`save_pretrained` followed by `from_pretrained` preserves the configuration."""
        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
            check_json_file_has_correct_format(saved_file)
            feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)

        self._assert_same_config(feat_extract_first, feat_extract_second)

    def test_feat_extract_to_json_file(self):
        """`to_json_file` followed by `from_json_file` preserves the configuration."""
        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
            feat_extract_first.to_json_file(json_file_path)
            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)

        self._assert_same_config(feat_extract_first, feat_extract_second)

    def _check_audio_values(self, feature_extractor, encoded_audios):
        """Assert the 4-dimensional layout shared by every `audio_values` output in `test_call`."""
        self.assertTrue(encoded_audios.ndim == 4)
        self.assertTrue(encoded_audios.shape[-1] == feature_extractor.feature_size)
        self.assertTrue(encoded_audios.shape[-2] <= feature_extractor.spectrogram_length)
        self.assertTrue(encoded_audios.shape[-3] == feature_extractor.num_channels)

    def test_call(self):
        """Unbatched, batched and masked calls all return `audio_values` with the expected layout."""
        # Initialize feature_extractor
        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)

        # create three inputs of length 8000, 10000, and 12000
        # (the step used to be 20000, which produced a single input and made the
        # "batched" checks below run on a batch of one)
        speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test not batched input
        encoded_audios = feature_extractor(np_speech_inputs[0], return_tensors="np", sampling_rate=44100).audio_values
        self._check_audio_values(feature_extractor, encoded_audios)

        # Test batched
        encoded_audios = feature_extractor(np_speech_inputs, return_tensors="np", sampling_rate=44100).audio_values
        self._check_audio_values(feature_extractor, encoded_audios)

        # Test audio masking
        encoded_audios = feature_extractor(
            np_speech_inputs, return_tensors="np", sampling_rate=44100, mask_audio=True
        ).audio_values
        self._check_audio_values(feature_extractor, encoded_audios)

    def _load_datasamples(self, num_samples):
        """Return the audio arrays of the first `num_samples` validation samples, sorted by `id`."""
        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # automatic decoding with librispeech
        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]

        return [x["array"] for x in speech_samples]

    def test_integration(self):
        """A default `TvltFeatureExtractor` reproduces reference values on a dummy LibriSpeech sample."""
        input_speech = self._load_datasamples(1)
        feature_extractor = TvltFeatureExtractor()
        audio_values = feature_extractor(input_speech, return_tensors="pt").audio_values

        # `assertTrue(shape, expected)` treated `expected` as the failure message and could
        # never fail; compare explicitly (a `torch.Size` compares equal to a tuple).
        self.assertEqual(audio_values.shape, (1, 1, 192, 128))

        expected_slice = torch.tensor([[-0.3032, -0.2708], [-0.4434, -0.4007]])
        self.assertTrue(torch.allclose(audio_values[0, 0, :2, :2], expected_slice, atol=1e-4))
|
||||
253
tests/models/tvlt/test_image_processor_tvlt.py
Normal file
253
tests/models/tvlt/test_image_processor_tvlt.py
Normal file
@@ -0,0 +1,253 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the TVLT image processor. """
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers.testing_utils import require_torch, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_image_processing_common import ImageProcessingSavingTestMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import TvltImageProcessor
|
||||
|
||||
|
||||
def prepare_video(image_processor_tester, width=10, height=10, numpify=False, torchify=False):
    """Prepare one random video as a list of PIL images, NumPy arrays or PyTorch tensors.

    Args:
        image_processor_tester: Object providing `num_frames` and `num_channels`.
        width: Size of the second axis of every generated frame.
        height: Size of the third axis of every generated frame.
        numpify: When true (and `torchify` is false), return the raw `uint8` arrays.
        torchify: When true, return `torch.Tensor` frames built with `torch.from_numpy`.

    Returns:
        A list of `num_frames` frames. Arrays and tensors have shape
        `(num_channels, width, height)`; PIL images are built from the channel-last view.
    """
    # The upper bound of `randint` is exclusive, so 256 is needed to cover the whole uint8 range.
    video = [
        np.random.randint(256, size=(image_processor_tester.num_channels, width, height), dtype=np.uint8)
        for _ in range(image_processor_tester.num_frames)
    ]

    if not numpify and not torchify:
        # PIL expects the channel dimension as last dimension
        video = [Image.fromarray(np.moveaxis(frame, 0, -1)) for frame in video]

    if torchify:
        video = [torch.from_numpy(frame) for frame in video]

    return video
|
||||
|
||||
|
||||
def prepare_video_inputs(image_processor_tester, equal_resolution=False, numpify=False, torchify=False):
    """Prepare a batch of videos with `prepare_video`.

    The result is a list of lists of PIL images, or of NumPy arrays if `numpify=True`,
    or of PyTorch tensors if `torchify=True`.

    Args:
        image_processor_tester: Object providing `batch_size`, `min_resolution` and
            `max_resolution`; it is also forwarded to `prepare_video`.
        equal_resolution: When true, every video uses `max_resolution` for both width and
            height. Otherwise width and height are drawn independently from
            `np.arange(min_resolution, max_resolution)` for each video.
        numpify: Forwarded to `prepare_video`.
        torchify: Forwarded to `prepare_video`.

    Returns:
        A list with `batch_size` videos.

    Raises:
        AssertionError: If both `numpify` and `torchify` are set.
    """
    assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"

    video_inputs = []
    for _ in range(image_processor_tester.batch_size):
        if equal_resolution:
            width = height = image_processor_tester.max_resolution
        else:
            width, height = np.random.choice(
                np.arange(image_processor_tester.min_resolution, image_processor_tester.max_resolution), 2
            )
        video = prepare_video(
            image_processor_tester=image_processor_tester,
            width=width,
            height=height,
            numpify=numpify,
            torchify=torchify,
        )
        video_inputs.append(video)

    return video_inputs
|
||||
|
||||
|
||||
class TvltImageProcessorTester(unittest.TestCase):
    """Holds the settings used by the TVLT image processor tests."""

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        num_frames=4,
        image_size=18,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=None,
        do_normalize=True,
        image_mean=None,
        image_std=None,
        do_center_crop=True,
        crop_size=None,
    ):
        # Defaults are built per instance so that no list or dict is shared between testers
        # (the list defaults used to be mutable default arguments).
        size = size if size is not None else {"shortest_edge": 18}
        crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18}
        image_mean = image_mean if image_mean is not None else [0.5, 0.5, 0.5]
        image_std = image_std if image_std is not None else [0.5, 0.5, 0.5]

        # NOTE: `unittest.TestCase.__init__` is deliberately not called (as in the original);
        # this object is only used as a container of settings by the real test case.
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.num_frames = num_frames
        self.image_size = image_size
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size

    def prepare_image_processor_dict(self):
        """Return the stored processing settings as a dict of image processor keyword arguments."""
        return {
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_normalize": self.do_normalize,
            "do_resize": self.do_resize,
            "size": self.size,
            "do_center_crop": self.do_center_crop,
            "crop_size": self.crop_size,
        }
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
class TvltImageProcessorTest(ImageProcessingSavingTestMixin, unittest.TestCase):
|
||||
image_processing_class = TvltImageProcessor if is_vision_available() else None
|
||||
|
||||
def setUp(self):
|
||||
self.image_processor_tester = TvltImageProcessorTester(self)
|
||||
|
||||
@property
|
||||
def image_processor_dict(self):
|
||||
return self.image_processor_tester.prepare_image_processor_dict()
|
||||
|
||||
def test_image_processor_properties(self):
|
||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
||||
self.assertTrue(hasattr(image_processor, "image_mean"))
|
||||
self.assertTrue(hasattr(image_processor, "image_std"))
|
||||
self.assertTrue(hasattr(image_processor, "do_normalize"))
|
||||
self.assertTrue(hasattr(image_processor, "do_resize"))
|
||||
self.assertTrue(hasattr(image_processor, "do_center_crop"))
|
||||
self.assertTrue(hasattr(image_processor, "size"))
|
||||
|
||||
def test_call_pil(self):
|
||||
# Initialize image_processor
|
||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random PIL videos
|
||||
video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, list)
|
||||
self.assertIsInstance(video[0], Image.Image)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
|
||||
self.assertEqual(
|
||||
encoded_videos.shape,
|
||||
(
|
||||
1,
|
||||
self.image_processor_tester.num_frames,
|
||||
self.image_processor_tester.num_channels,
|
||||
self.image_processor_tester.crop_size["height"],
|
||||
self.image_processor_tester.crop_size["width"],
|
||||
),
|
||||
)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
|
||||
self.assertEqual(
|
||||
encoded_videos.shape,
|
||||
(
|
||||
self.image_processor_tester.batch_size,
|
||||
self.image_processor_tester.num_frames,
|
||||
self.image_processor_tester.num_channels,
|
||||
self.image_processor_tester.crop_size["height"],
|
||||
self.image_processor_tester.crop_size["width"],
|
||||
),
|
||||
)
|
||||
|
||||
def test_call_numpy(self):
|
||||
# Initialize image_processor
|
||||
image_processor = self.image_processing_class(**self.image_processor_dict)
|
||||
# create random numpy tensors
|
||||
video_inputs = prepare_video_inputs(self.image_processor_tester, equal_resolution=False, numpify=True)
|
||||
for video in video_inputs:
|
||||
self.assertIsInstance(video, list)
|
||||
self.assertIsInstance(video[0], np.ndarray)
|
||||
|
||||
# Test not batched input
|
||||
encoded_videos = image_processor(video_inputs[0], return_tensors="pt").pixel_values
|
||||
self.assertEqual(
|
||||
encoded_videos.shape,
|
||||
(
|
||||
1,
|
||||
self.image_processor_tester.num_frames,
|
||||
self.image_processor_tester.num_channels,
|
||||
self.image_processor_tester.crop_size["height"],
|
||||
self.image_processor_tester.crop_size["width"],
|
||||
),
|
||||
)
|
||||
|
||||
# Test batched
|
||||
encoded_videos = image_processor(video_inputs, return_tensors="pt").pixel_values
|
||||
self.assertEqual(
|
||||
encoded_videos.shape,
|
||||
(
|
||||
self.image_processor_tester.batch_size,
|
||||
self.image_processor_tester.num_frames,
|
||||
self.image_processor_tester.num_channels,
|
||||
self.image_processor_tester.crop_size["height"],
|
||||
self.image_processor_tester.crop_size["width"],
|
||||
),
|
||||
)
|
||||
|
||||
def test_call_pytorch(self):
    """Check the `pixel_values` shape produced from videos given as lists of torch tensors."""
    processor = self.image_processing_class(**self.image_processor_dict)
    tester = self.image_processor_tester

    # Videos are built with `equal_resolution=False` and `torchify=True`.
    videos = prepare_video_inputs(tester, equal_resolution=False, torchify=True)
    for frames in videos:
        self.assertIsInstance(frames, list)
        self.assertIsInstance(frames[0], torch.Tensor)

    # Everything after the batch axis is identical for both calls below.
    per_video_shape = (
        tester.num_frames,
        tester.num_channels,
        tester.crop_size["height"],
        tester.crop_size["width"],
    )

    # Not batched: a single video is expected to come back with a batch axis of 1.
    single = processor(videos[0], return_tensors="pt").pixel_values
    self.assertEqual(single.shape, (1, *per_video_shape))

    # Batched: one entry per video.
    batched = processor(videos, return_tensors="pt").pixel_values
    self.assertEqual(batched.shape, (tester.batch_size, *per_video_shape))
|
||||
628
tests/models/tvlt/test_modeling_tvlt.py
Normal file
628
tests/models/tvlt/test_modeling_tvlt.py
Normal file
@@ -0,0 +1,628 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the PyTorch TVLT model. """
|
||||
|
||||
import copy
|
||||
import inspect
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from transformers import (
|
||||
TvltConfig,
|
||||
is_datasets_available,
|
||||
is_speech_available,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
)
|
||||
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
|
||||
from transformers.utils import cached_property
|
||||
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import ModelTesterMixin, floats_tensor
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from transformers import TvltForAudioVisualClassification, TvltForPreTraining, TvltModel
|
||||
from transformers.models.tvlt.modeling_tvlt import TVLT_PRETRAINED_MODEL_ARCHIVE_LIST
|
||||
from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_10
|
||||
else:
|
||||
is_torch_greater_or_equal_than_1_10 = False
|
||||
|
||||
|
||||
if is_datasets_available():
|
||||
from datasets import load_dataset
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import TvltImageProcessor
|
||||
|
||||
if is_speech_available():
|
||||
from transformers import TvltFeatureExtractor
|
||||
|
||||
|
||||
class TvltModelTester:
    """Build small TVLT configs and inputs, and check the output shapes of the TVLT model classes.

    `parent` is the `unittest.TestCase` whose assertion methods are used by the
    `create_and_check_*` helpers.
    """

    def __init__(
        self,
        parent,
        batch_size=2,
        image_size=32,
        spectrogram_length=32,
        frequency_length=16,
        image_patch_size=None,
        audio_patch_size=None,
        num_image_channels=3,
        num_audio_channels=1,
        num_frames=2,
        hidden_size=128,
        num_hidden_layers=12,
        num_attention_heads=4,
        intermediate_size=128,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        qkv_bias=True,
        use_mean_pooling=True,
        decoder_num_attention_heads=4,
        decoder_hidden_size=64,
        decoder_num_hidden_layers=2,
        decoder_intermediate_size=128,
        image_mask_ratio=0.75,
        audio_mask_ratio=0.15,
        audio_mask_type="frame-level",
        task_matching=True,
        task_mae=True,
        num_labels=1,
        is_training=True,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.spectrogram_length = spectrogram_length
        self.frequency_length = frequency_length
        # `None` stands for the default `[2, 2]`; a fresh list is created per instance so that
        # mutating one tester's patch size cannot leak into another (mutable default argument).
        self.image_patch_size = [2, 2] if image_patch_size is None else image_patch_size
        self.audio_patch_size = [2, 2] if audio_patch_size is None else audio_patch_size
        self.num_image_channels = num_image_channels
        self.num_audio_channels = num_audio_channels
        self.num_frames = num_frames

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.qkv_bias = qkv_bias
        self.use_mean_pooling = use_mean_pooling

        self.decoder_num_attention_heads = decoder_num_attention_heads
        self.decoder_hidden_size = decoder_hidden_size
        self.decoder_num_hidden_layers = decoder_num_hidden_layers
        self.decoder_intermediate_size = decoder_intermediate_size
        self.image_mask_ratio = image_mask_ratio
        self.audio_mask_ratio = audio_mask_ratio
        # Previously accepted but silently dropped; now kept and forwarded to the config.
        self.audio_mask_type = audio_mask_type

        self.task_matching = task_matching
        self.task_mae = task_mae
        self.num_labels = num_labels

        # Number of image patches over all frames (uses the first patch dimension for both axes).
        self.expected_pixel_seq_len = (self.image_size // self.image_patch_size[0]) ** 2 * self.num_frames
        # Number of audio patches: time patches times frequency patches.
        self.expected_audio_seq_len = (self.spectrogram_length // self.audio_patch_size[0]) * (
            self.frequency_length // self.audio_patch_size[1]
        )
        # we set the expected sequence length (which is used in several tests)
        # this is equal to the number of image/video patches + the number of audio patches + 1
        # (the extra position is presumably the [CLS] token -- confirm against TvltModel)
        self.expected_seq_len = self.expected_pixel_seq_len + self.expected_audio_seq_len + 1

        # Per-patch output sizes checked against the pretraining logits.
        self.image_mae_output_dim = self.image_patch_size[0] ** 2 * num_image_channels
        self.audio_mae_output_dim = self.audio_patch_size[0] * self.audio_patch_size[1] * num_audio_channels
        self.is_training = is_training

    def prepare_config_and_inputs(self):
        """Return `(config, pixel_values, audio_values, pixel_mask, audio_mask)`."""
        pixel_values = self.prepare_pixel_values()
        audio_values = self.prepare_audio_values()

        pixel_mask = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
        audio_mask = floats_tensor([self.batch_size, self.expected_audio_seq_len])

        config = self.get_config()

        return (config, pixel_values, audio_values, pixel_mask, audio_mask)

    def prepare_config_and_inputs_for_pretraining(self):
        """Return the common inputs plus `pixel_values_mixed`, `pixel_mask_mixed` and `labels`."""
        config, pixel_values, audio_values, pixel_mask, audio_mask = self.prepare_config_and_inputs()

        pixel_values_mixed = self.prepare_pixel_values()
        pixel_mask_mixed = floats_tensor([self.batch_size, self.expected_pixel_seq_len])
        labels = floats_tensor([self.batch_size])

        return (
            config,
            pixel_values,
            audio_values,
            pixel_mask,
            audio_mask,
            pixel_values_mixed,
            pixel_mask_mixed,
            labels,
        )

    def get_config(self):
        """Build a `TvltConfig` from the hyper-parameters stored on this tester."""
        return TvltConfig(
            image_size=self.image_size,
            spectrogram_length=self.spectrogram_length,
            frequency_length=self.frequency_length,
            image_patch_size=self.image_patch_size,
            audio_patch_size=self.audio_patch_size,
            num_image_channels=self.num_image_channels,
            num_audio_channels=self.num_audio_channels,
            num_frames=self.num_frames,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            initializer_range=self.initializer_range,
            layer_norm_eps=self.layer_norm_eps,
            qkv_bias=self.qkv_bias,
            use_mean_pooling=self.use_mean_pooling,
            decoder_num_attention_heads=self.decoder_num_attention_heads,
            decoder_hidden_size=self.decoder_hidden_size,
            decoder_num_hidden_layers=self.decoder_num_hidden_layers,
            decoder_intermediate_size=self.decoder_intermediate_size,
            image_mask_ratio=self.image_mask_ratio,
            audio_mask_ratio=self.audio_mask_ratio,
            audio_mask_type=self.audio_mask_type,
            task_matching=self.task_matching,
            task_mae=self.task_mae,
            num_labels=self.num_labels,
        )

    def create_and_check_model(self, config, pixel_values, audio_values, pixel_mask, audio_mask):
        """Run `TvltModel` with and without masks and check the last hidden state shape."""
        model = TvltModel(config=config)
        model.to(torch_device)
        model.eval()
        # The masked call is only exercised for errors; the shape check uses the unmasked call.
        model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask)
        result = model(pixel_values, audio_values)
        self.parent.assertEqual(
            result.last_hidden_state.shape, (self.batch_size, self.expected_seq_len, self.hidden_size)
        )

    def create_and_check_for_audiovisual_classification(
        self, config, pixel_values, audio_values, pixel_mask, audio_mask
    ):
        """Run `TvltForAudioVisualClassification` with and without masks and check the logits shape."""
        model = TvltForAudioVisualClassification(config=config)
        model.to(torch_device)
        model.eval()
        # The masked call is only exercised for errors; the shape check uses the unmasked call.
        model(pixel_values, audio_values, pixel_mask=pixel_mask, audio_mask=audio_mask)
        result = model(pixel_values, audio_values)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))

    def create_and_check_for_pretraining(
        self,
        config,
        pixel_values,
        audio_values,
        pixel_mask,
        audio_mask,
        pixel_values_mixed,
        pixel_mask_mixed,
        labels,
    ):
        """Run `TvltForPreTraining` in train mode and check the shapes of all three logits."""
        model = TvltForPreTraining(config=config)
        model.to(torch_device)
        model.train()
        result = model(
            pixel_values,
            audio_values,
            pixel_mask,
            audio_mask,
            pixel_values_mixed=pixel_values_mixed,
            pixel_mask_mixed=pixel_mask_mixed,
            labels=labels,
        )
        self.parent.assertEqual(
            result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim)
        )
        self.parent.assertEqual(
            result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim)
        )
        self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels))

    def create_and_check_for_pretraining_inference(
        self,
        config,
        pixel_values,
        audio_values,
        pixel_mask,
        audio_mask,
        pixel_values_mixed,
        pixel_mask_mixed,
        labels,
    ):
        """Run `TvltForPreTraining` in eval mode; pixel/audio logits are checked only when returned."""
        model = TvltForPreTraining(config=config)
        model.to(torch_device)
        model.eval()
        result = model(
            pixel_values,
            audio_values,
            pixel_mask,
            audio_mask,
            pixel_values_mixed=pixel_values_mixed,
            pixel_mask_mixed=pixel_mask_mixed,
            labels=labels,
        )
        if result.pixel_logits is not None:
            self.parent.assertEqual(
                result.pixel_logits.shape, (self.batch_size, self.expected_pixel_seq_len, self.image_mae_output_dim)
            )
        if result.audio_logits is not None:
            self.parent.assertEqual(
                result.audio_logits.shape, (self.batch_size, self.expected_audio_seq_len, self.audio_mae_output_dim)
            )
        self.parent.assertEqual(result.matching_logits.shape, (self.batch_size, self.num_labels))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with the four common inputs keyed by argument name."""
        (config, pixel_values, audio_values, pixel_mask, audio_mask) = self.prepare_config_and_inputs()
        inputs_dict = {
            "pixel_values": pixel_values,
            "audio_values": audio_values,
            "pixel_mask": pixel_mask,
            "audio_mask": audio_mask,
        }
        return config, inputs_dict

    def prepare_pixel_values(self):
        """Return a float tensor of size (batch_size, num_frames, num_image_channels, image_size, image_size)."""
        return floats_tensor(
            [self.batch_size, self.num_frames, self.num_image_channels, self.image_size, self.image_size]
        )

    def prepare_audio_values(self):
        """Return a float tensor of size (batch_size, num_audio_channels, spectrogram_length, frequency_length)."""
        return floats_tensor(
            [self.batch_size, self.num_audio_channels, self.spectrogram_length, self.frequency_length]
        )
|
||||
|
||||
|
||||
@require_torch
@unittest.skipIf(not is_torch_greater_or_equal_than_1_10, "TVLT is only available in torch v1.10+")
class TvltModelTest(ModelTesterMixin, unittest.TestCase):
    """Common model tests for `TvltModel`, `TvltForPreTraining` and `TvltForAudioVisualClassification`."""

    all_model_classes = (
        (TvltModel, TvltForPreTraining, TvltForAudioVisualClassification) if is_torch_available() else ()
    )

    fx_compatible = False
    test_pruning = False
    test_headmasking = False
    test_torchscript = False
    test_resize_embeddings = False
    main_input_name = "pixel_values"

    # TvltForAudioVisualClassification and TvltForPreTraining require special treatment
    def _prepare_for_class(self, inputs_dict, model_class, return_labels=True):
        """Return a deep copy of `inputs_dict`, adding the label-type inputs each head needs.

        With `return_labels`, the classification head gets integer `labels`; the pretraining
        head gets float `labels` plus all-zero `pixel_values_mixed` and `pixel_mask_mixed`.
        """
        inputs_dict = copy.deepcopy(inputs_dict)

        if return_labels:
            tester = self.model_tester
            if model_class.__name__ == "TvltForAudioVisualClassification":
                inputs_dict["labels"] = torch.zeros((tester.batch_size,), dtype=torch.long, device=torch_device)
            elif model_class.__name__ == "TvltForPreTraining":
                inputs_dict["labels"] = torch.zeros((tester.batch_size,), dtype=torch.float, device=torch_device)
                inputs_dict["pixel_values_mixed"] = torch.zeros(
                    (
                        tester.batch_size,
                        tester.num_frames,
                        tester.num_image_channels,
                        tester.image_size,
                        tester.image_size,
                    ),
                    dtype=torch.float,
                    device=torch_device,
                )
                inputs_dict["pixel_mask_mixed"] = torch.zeros(
                    (tester.batch_size, tester.expected_pixel_seq_len),
                    dtype=torch.float,
                    device=torch_device,
                )

        return inputs_dict

    def setUp(self):
        self.model_tester = TvltModelTester(self)
        self.config_tester = ConfigTester(self, config_class=TvltConfig, has_text_modality=False, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(reason="TVLT does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_common_attributes(self):
        """Input embeddings must be a tuple of modules; output embeddings are `None` or linear."""
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            input_embeddings = model.get_input_embeddings()
            self.assertIsInstance(input_embeddings, tuple)
            for embedding in input_embeddings:
                self.assertIsInstance(embedding, nn.Module)
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, nn.Linear))

    def test_forward_signature(self):
        """The first two `forward` arguments must be `pixel_values` and `audio_values`."""
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.forward)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = list(signature.parameters)

            expected_arg_names = ["pixel_values", "audio_values"]
            self.assertListEqual(arg_names[:2], expected_arg_names)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_audiovisual_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_audiovisual_classification(*config_and_inputs)

    def test_for_pretraining(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_pretraining()
        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)
        self.model_tester.create_and_check_for_pretraining_inference(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in TVLT_PRETRAINED_MODEL_ARCHIVE_LIST:
            model = TvltModel.from_pretrained(model_name)
            self.assertIsNotNone(model)

    def test_training(self):
        """Run a forward/backward pass for every model class except the first (`TvltModel`)."""
        if not self.model_tester.is_training:
            return

        # [1:] skips TvltModel -- presumably because the base model returns no loss.
        for model_class in self.all_model_classes[1:]:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config.return_dict = True

            model = model_class(config)
            model.to(torch_device)
            model.train()
            inputs = self._prepare_for_class(inputs_dict, model_class)
            loss = model(**inputs).loss
            loss.backward()

    def test_training_gradient_checkpointing(self):
        """Same as `test_training`, with gradient checkpointing enabled on the model."""
        if not self.model_tester.is_training:
            return

        # [1:] skips TvltModel -- presumably because the base model returns no loss.
        for model_class in self.all_model_classes[1:]:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config.use_cache = False
            config.return_dict = True

            model = model_class(config)
            model.to(torch_device)
            model.gradient_checkpointing_enable()
            model.train()
            inputs = self._prepare_for_class(inputs_dict, model_class)
            loss = model(**inputs).loss
            loss.backward()

    def test_attention_outputs(self):
        """Check the number and shape of returned attentions (last model class only)."""
        if not self.has_attentions:
            return

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        # [2:] restricts the check to TvltForAudioVisualClassification.
        for model_class in self.all_model_classes[2:]:
            seq_len = self.model_tester.expected_seq_len

            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, seq_len, seq_len],
            )
            out_len = len(outputs)

            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            # Requesting hidden states adds exactly one entry to the outputs.
            self.assertEqual(out_len + 1, len(outputs))

            self_attentions = outputs.attentions

            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, seq_len, seq_len],
            )

    def test_hidden_states_output(self):
        """Check the number and shape of returned hidden states (last model class only)."""

        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            hidden_states = outputs.hidden_states
            # One hidden state per layer plus one more (hence the + 1).
            expected_num_layers = self.model_tester.num_hidden_layers + 1
            self.assertEqual(len(hidden_states), expected_num_layers)

            seq_length = self.model_tester.expected_seq_len

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [seq_length, self.model_tester.hidden_size],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        # [2:] restricts the check to TvltForAudioVisualClassification.
        for model_class in self.all_model_classes[2:]:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)
|
||||
|
||||
|
||||
# We will verify our results on a video of eating spaghetti
|
||||
# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227]
|
||||
def prepare_video(num_frames=8):
    """Download the `eating_spaghetti.npy` test video and return its first `num_frames` frames as a list."""
    video_path = hf_hub_download(
        repo_id="hf-internal-testing/spaghetti-video",
        filename="eating_spaghetti.npy",
        repo_type="dataset",
    )
    frames = np.load(video_path)
    # Slice along the first axis, then split that axis into a Python list.
    return list(frames[:num_frames])
|
||||
|
||||
|
||||
def prepare_audio(num_samples=1):
    """Return the `array` of the first `num_samples` audio entries of the dummy LibriSpeech set, sorted by id."""
    dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    # automatic decoding with librispeech
    first_rows = dataset.sort("id").select(range(num_samples))
    audio_entries = first_rows[:num_samples]["audio"]
    return [entry["array"] for entry in audio_entries]
|
||||
|
||||
|
||||
@require_torch
@require_vision
class TvltModelIntegrationTest(unittest.TestCase):
    """Integration tests running the `ZinengTang/tvlt-base` checkpoint on a real video and audio clip."""

    @cached_property
    def default_feature_extractor(self):
        """Return an `(image processor, audio feature extractor)` pair built with default arguments."""
        return (
            TvltImageProcessor() if is_vision_available() else None,
            TvltFeatureExtractor(),
        )

    def test_inference_for_base_model(self):
        """Compare a slice of the base model's last hidden state with reference values."""
        model = TvltModel.from_pretrained("ZinengTang/tvlt-base").to(torch_device)

        image_processor, audio_feature_extractor = self.default_feature_extractor
        video = prepare_video()
        audio = prepare_audio()
        video_inputs = image_processor(video, return_tensors="pt").to(torch_device)
        audio_inputs = audio_feature_extractor(audio, return_tensors="pt").to(torch_device)
        inputs = {}
        inputs.update(video_inputs)
        inputs.update(audio_inputs)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify a slice of the last hidden state
        # The reference is created on `torch_device` so that `torch.allclose` compares tensors
        # on the same device when the test runs on an accelerator.
        expected_last_hidden_state_slice = torch.tensor(
            [[-0.0186, -0.0691], [0.0242, -0.0398]], device=torch_device
        )
        self.assertTrue(
            torch.allclose(outputs.last_hidden_state[:, :2, :2], expected_last_hidden_state_slice, atol=1e-4)
        )

    def test_inference_for_pretraining(self):
        """Check the shapes of the pixel, audio and matching logits of the pretraining model."""
        model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base").to(torch_device)

        image_processor, audio_feature_extractor = self.default_feature_extractor
        video = prepare_video()
        video_mixed = prepare_video()
        audio = prepare_audio()
        video_inputs = image_processor(video, return_tensors="pt", mask_pixel=True).to(torch_device)
        video_mixed_inputs = image_processor(video_mixed, is_mixed=True, return_tensors="pt").to(torch_device)
        audio_inputs = audio_feature_extractor(audio, return_tensors="pt", mask_audio=True).to(torch_device)
        labels = torch.tensor([[0.0]], device=torch_device)
        inputs = {}
        inputs.update(video_inputs)
        inputs.update(video_mixed_inputs)
        inputs.update(audio_inputs)
        inputs.update({"labels": labels})

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # verify the logits
        expected_pixel_logits_shape = torch.Size([1, 1568, 768])
        expected_audio_logits_shape = torch.Size([1, 96, 256])
        expected_matching_logits_shape = torch.Size([1, 1])

        if outputs.pixel_logits is not None:
            self.assertEqual(outputs.pixel_logits.shape, expected_pixel_logits_shape)
        if outputs.audio_logits is not None:
            self.assertEqual(outputs.audio_logits.shape, expected_audio_logits_shape)
        # assertEqual, not assertTrue: assertTrue treats its second argument as the failure
        # message and would pass for any non-empty shape.
        self.assertEqual(outputs.matching_logits.shape, expected_matching_logits_shape)
|
||||
116
tests/models/tvlt/test_processor_tvlt.py
Normal file
116
tests/models/tvlt/test_processor_tvlt.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from transformers import is_speech_available, is_vision_available
|
||||
from transformers.testing_utils import require_torch
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from transformers import TvltImageProcessor
|
||||
|
||||
if is_speech_available():
|
||||
from transformers import TvltFeatureExtractor
|
||||
|
||||
from transformers import TvltProcessor
|
||||
|
||||
|
||||
@require_torch
class TvltProcessorTest(unittest.TestCase):
    """Tests that `TvltProcessor` wraps its image processor and feature extractor consistently."""

    def setUp(self):
        self.checkpoint = "ZinengTang/tvlt-base"
        self.tmpdirname = tempfile.mkdtemp()

    def get_image_processor(self, **kwargs):
        return TvltImageProcessor.from_pretrained(self.checkpoint, **kwargs)

    def get_feature_extractor(self, **kwargs):
        return TvltFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def _build_processor(self):
        """Return `(processor, image_processor, feature_extractor)` loaded from the test checkpoint."""
        image_processor = self.get_image_processor()
        feature_extractor = self.get_feature_extractor()
        processor = TvltProcessor(image_processor=image_processor, feature_extractor=feature_extractor)
        return processor, image_processor, feature_extractor

    def test_save_load_pretrained_default(self):
        """A saved and reloaded processor keeps the component types."""
        processor, _, _ = self._build_processor()

        processor.save_pretrained(self.tmpdirname)
        reloaded = TvltProcessor.from_pretrained(self.tmpdirname)

        self.assertIsInstance(reloaded.feature_extractor, TvltFeatureExtractor)
        self.assertIsInstance(reloaded.image_processor, TvltImageProcessor)

    def test_feature_extractor(self):
        """Audio passed through the processor matches the feature extractor called directly."""
        processor, _, feature_extractor = self._build_processor()

        audio = np.ones([12000])

        expected = feature_extractor(audio, return_tensors="np")
        actual = processor(audio=audio, return_tensors="np")

        for key, value in expected.items():
            self.assertAlmostEqual(value.sum(), actual[key].sum(), delta=1e-2)

    def test_image_processor(self):
        """Images passed through the processor match the image processor called directly."""
        processor, image_processor, _ = self._build_processor()

        images = np.ones([3, 224, 224])

        expected = image_processor(images, return_tensors="np")
        actual = processor(images=images, return_tensors="np")

        for key, value in expected.items():
            self.assertAlmostEqual(value.sum(), actual[key].sum(), delta=1e-2)

    def test_processor(self):
        """Joint audio + image call returns the four expected keys; an empty call raises."""
        processor, _, _ = self._build_processor()

        audio = np.ones([12000])
        images = np.ones([3, 224, 224])

        inputs = processor(audio=audio, images=images)

        expected_keys = ["audio_values", "audio_mask", "pixel_values", "pixel_mask"]
        self.assertListEqual(list(inputs.keys()), expected_keys)

        # test if it raises when no input is passed
        with pytest.raises(ValueError):
            processor()

    def test_model_input_names(self):
        """The processor's input names are the image processor's followed by the feature extractor's."""
        processor, image_processor, feature_extractor = self._build_processor()

        expected_names = image_processor.model_input_names + feature_extractor.model_input_names
        self.assertListEqual(
            processor.model_input_names,
            expected_names,
            msg="`processor` and `image_processor`+`feature_extractor` model input names do not match",
        )
|
||||
Reference in New Issue
Block a user