[CLAP] Add CLAP to the library (#21370)
* add model like clip * update * text model ok * clap text works * some refactor - `CLAPVision` to `CLAPAudio` - refactor kwargs of audio modules * more refactor * more refactor * more refactor * correct fusion * more refactor * new modules * add basic processor * fixup * remove whisper copied from * audio logits match * add doc * correct filters mel and add maxlength * style * few fixes * forward passes * fixup * fixup * some clean up * remove mels from the dictionary * pad after the repeat * update padding when dsmaller * fix padding * style * use swin patch merging * use copied from swin * processor with any tokenizer * more copied from * some clean up * more refactor * fix mel when rand_trunc * style * remove unused imports * update processing * remove image processing tests * add testing file * fix modeling issues * replace with `is_longer` * clap in serialization * more refactor * `make fixup` * make fixup * fix feature extractor * update test feature extractor * `make fixup` * clean up config * more clean up * more cleanup * update tests * refactor tests and inits * remove CLAP vision config * remove CLAP from image processing auto and dummy vision objects * update inits * style * reorder classes in modeling clap * Use roberta tokenizer as the other weights are not open sourced * small cleanup * remove tokenization CLAP * processor tokenizer is roberta * update feature extraction doc * remove vclap from model zero shot * update f_min and f_max to frequency_xx * some changes - fix modeling keys - add `is_longer` in the forward pass - make fixup * make fixup * consistent behavior between rand_crop and fusion * add numpy resize and bilinear and documentation * move resizing to image utils * clean feature extraction * import resize from correct file * resize in image transforms * update * style * style * nit * remove unused arguments from the feature extractor * style * few fixes + make fixup * oops * fix more tests * add zero shot audio classification pipeline * 
update zeroshot classification pipeline * fixup * fix copies * all CI tests pass * make fixup + fix docs * fix docs * fix docs * update tests pipeline * update zero shot pipeline * update feature extraction clap * update tokenization auto * use nested simplify * update pipeline tests * Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * split in two lines * fixes * refactor * clean up * add integration tests * update config docstring * style * update processor * fix processor test * fix feat extractor tests * update docs * Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * fix readmes * fix tips * Update src/transformers/models/auto/configuration_auto.py * update doc and remove todo -> properly explained * fix idx and typo * typo * cleanup config * cleanup tests, styles and doc * ignore docstyle on image transform * add conversion script * remove the `clap` index in favor of `CLAP` * update __init * nits * Update src/transformers/pipelines/__init__.py * fix bug * clarify config * fix copy * fix init * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * fix model output * fix comment * make fixup * make fixup * rename to `Clap` * replace to `Clap` * replace to `Clap` * repo consistency * again repo-consistency * make fixup * Apply suggestions from code review Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * add config * changes * update conversion * Apply suggestions from code review Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * remove unused function * update based on code reviews * style * more comments * cleanup * clean up * style * apply suggestions * Empty commit * pipeline will be added in a different PR * update calls to audio utils functions * update pipeline init * style * style * styling again * use pad * fix 
repo-consistency * update utils and add doc for audio utils * clean up resize by using torch. update inits accordingly * style * CLAP's tokenizer is RoBERTa * add audio utils to internal toctree * update toctree * style * update documentation and normalize naming across audio utils and feature extraction clap * style * clean up * update doc and typos * fix doctest * update modeling code, got rid of a lot of reshaping * style on added doc audio utils * update modeling clap * style * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * docstring variables with CLAP * rename key * update modeling CLAP * update audio utils docstring * update processing clap * fix readmes * fix toctree * update configuration clap * fix init * make fixup * fix * fix * update naming * update * update checkpoint path * Apply suggestions from code review * Major refactoring * Update src/transformers/models/clap/configuration_clap.py * merge --------- Co-authored-by: younesbelkada <younesbelkada@gmail.com> Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
This commit is contained in:
0
tests/models/clap/__init__.py
Normal file
0
tests/models/clap/__init__.py
Normal file
267
tests/models/clap/test_feature_extraction_clap.py
Normal file
267
tests/models/clap/test_feature_extraction_clap.py
Normal file
@@ -0,0 +1,267 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import itertools
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers import ClapFeatureExtractor
|
||||
from transformers.testing_utils import require_torch, require_torchaudio
|
||||
from transformers.utils.import_utils import is_torch_available
|
||||
|
||||
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
# Module-level generator used by `floats_list` when the caller passes no `rng`; no seed is given here.
global_rng = random.Random()
|
||||
|
||||
|
||||
# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
|
||||
def floats_list(shape, scale=1.0, rng=None, name=None):
    """Build a nested Python list of random floats.

    Args:
        shape: Sequence whose first two entries are the number of rows and the number of values per row.
        scale: Factor by which every drawn value is multiplied.
        rng: Object exposing a `random()` method. The module-level `global_rng` is used when `None`.
        name: Unused; kept so the signature stays unchanged.

    Returns:
        A list of `shape[0]` lists, each holding `shape[1]` values equal to `rng.random() * scale`.
    """
    if rng is None:
        rng = global_rng

    # Values are drawn row by row, so the same `rng` state always yields the same nested list.
    return [[rng.random() * scale for _ in range(shape[1])] for _ in range(shape[0])]
|
||||
|
||||
|
||||
@require_torch
@require_torchaudio
# Adapted from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTester with Whisper->Clap
class ClapFeatureExtractionTester(unittest.TestCase):
    """Stores the settings used to build feature-extractor kwargs and dummy inputs for the tests below."""

    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        feature_size=10,
        hop_length=160,
        chunk_length=8,
        padding_value=0.0,
        sampling_rate=4_000,
        return_attention_mask=False,
        do_normalize=True,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        # Step between consecutive input lengths in `prepare_inputs_for_common`. The divisor is clamped to 1 so
        # that `batch_size=1` does not raise ZeroDivisionError.
        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // max(self.batch_size - 1, 1)
        self.padding_value = padding_value
        self.sampling_rate = sampling_rate
        self.return_attention_mask = return_attention_mask
        self.do_normalize = do_normalize
        self.feature_size = feature_size
        self.chunk_length = chunk_length
        self.hop_length = hop_length

    def prepare_feat_extract_dict(self):
        """Return the keyword arguments the tests pass to the feature extractor constructor."""
        return {
            "feature_size": self.feature_size,
            "hop_length": self.hop_length,
            "chunk_length": self.chunk_length,
            "padding_value": self.padding_value,
            "sampling_rate": self.sampling_rate,
            "return_attention_mask": self.return_attention_mask,
            "do_normalize": self.do_normalize,
        }

    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        """Build a batch of random inputs with `floats_list`.

        Args:
            equal_length: When true, create `batch_size` inputs that all use `max_seq_length` rows. Otherwise the
                row count grows from `min_seq_length` in steps of `seq_length_diff`, stopping before
                `max_seq_length`.
            numpify: When true, convert every input with `np.asarray`.

        Returns:
            A list of inputs, each a nested list (or array) with `feature_size` values per row.
        """
        if equal_length:
            speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
        else:
            # make sure that inputs increase in size
            speech_inputs = [
                floats_list((x, self.feature_size))
                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            ]
        if numpify:
            speech_inputs = [np.asarray(x) for x in speech_inputs]
        return speech_inputs
|
||||
|
||||
|
||||
@require_torch
@require_torchaudio
# Adapted from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest with Whisper->Clap
class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    feature_extraction_class = ClapFeatureExtractor

    def setUp(self):
        self.feat_extract_tester = ClapFeatureExtractionTester(self)

    def test_call(self):
        # Checks that list and numpy inputs, batched and unbatched, produce matching features
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test feature size
        input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features
        self.assertEqual(input_features.ndim, 4)

        # Test not batched input
        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_double_precision_pad(self):
        # float64 inputs (nested list or array) must come out of `pad` as float32, for numpy and torch outputs
        import torch

        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
        py_speech_inputs = np_speech_inputs.tolist()

        for inputs in [py_speech_inputs, np_speech_inputs]:
            np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
            self.assertEqual(np_processed.input_features.dtype, np.float32)
            pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
            self.assertEqual(pt_processed.input_features.dtype, torch.float32)

    def _load_datasamples(self, num_samples):
        """Return the audio arrays of the first `num_samples` examples (sorted by id) of the dummy LibriSpeech set."""
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # automatic decoding with librispeech
        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]

        return [x["array"] for x in speech_samples]

    # NOTE(review): the name does not start with `test`, so unittest never collects this method — confirm
    # whether it is meant to run, and re-validate the expected values and indexing before renaming it.
    def integration_test_fusion(self):
        # fmt: off
        EXPECTED_INPUT_FEATURES = torch.tensor(
            [
                [
                    -30.2194, -22.4424, -18.6442, -17.2452, -22.7392, -32.2576, -36.1404,
                    -35.6120, -29.6229, -29.0454, -32.2157, -36.7664, -29.4436, -26.7825,
                    -31.1811, -38.3918, -38.8749, -43.4485, -47.6236, -38.7528, -31.8574,
                    -39.0591, -41.3190, -32.3319, -31.4699, -33.4502, -36.7412, -34.5265,
                    -35.1091, -40.4518, -42.7346, -44.5909, -44.9747, -45.8328, -47.0772,
                    -46.2723, -44.3613, -48.6253, -44.9551, -43.8700, -44.6104, -48.0146,
                    -42.7614, -47.3587, -47.4369, -45.5018, -47.0198, -42.8759, -47.5056,
                    -47.1567, -49.2621, -49.5643, -48.4330, -48.8495, -47.2512, -40.8439,
                    -48.1234, -49.1218, -48.7222, -50.2399, -46.8487, -41.9921, -50.4015,
                    -50.7827
                ],
                [
                    -89.0141, -89.1411, -88.8096, -88.5480, -88.3481, -88.2038,
                    -88.1105, -88.0647, -88.0636, -88.1051, -88.1877, -88.1110,
                    -87.8613, -88.6679, -88.2685, -88.9684, -88.7977, -89.6264,
                    -89.9299, -90.3184, -91.1446, -91.9265, -92.7267, -93.6099,
                    -94.6395, -95.3243, -95.5923, -95.5773, -95.0889, -94.3354,
                    -93.5746, -92.9287, -92.4525, -91.9798, -91.8852, -91.7500,
                    -91.7259, -91.7561, -91.7959, -91.7070, -91.6914, -91.5019,
                    -91.0640, -90.0807, -88.7102, -87.0826, -85.5956, -84.4441,
                    -83.8461, -83.8605, -84.6702, -86.3900, -89.3073, -93.2926,
                    -96.3813, -97.3529, -100.0000, -99.6942, -92.2851, -87.9588,
                    -85.7214, -84.6807, -84.1940, -84.2021
                ],
                [
                    -51.6882, -50.6852, -50.8198, -51.7428, -53.0325, -54.1619, -56.4903,
                    -59.0314, -60.7996, -60.5164, -59.9680, -60.5393, -62.5796, -65.4166,
                    -65.6149, -65.1409, -65.7226, -67.9057, -72.5089, -82.3530, -86.3189,
                    -83.4241, -79.1279, -79.3384, -82.7335, -79.8316, -80.2167, -74.3638,
                    -71.3930, -75.3849, -74.5381, -71.4504, -70.3791, -71.4547, -71.8820,
                    -67.3885, -69.5686, -71.9852, -71.0307, -73.0053, -80.8802, -72.9227,
                    -63.8526, -60.3260, -59.6012, -57.8316, -61.0603, -67.3403, -67.1709,
                    -60.4967, -60.5079, -68.3345, -67.5213, -70.6416, -79.6219, -78.2198,
                    -74.6851, -69.5718, -69.4968, -70.6882, -66.8175, -73.8558, -74.3855,
                    -72.9405
                ]
            ]
        )
        # fmt: on
        MEL_BIN = [963, 963, 161]
        input_speech = self._load_datasamples(1)
        feature_extractor = ClapFeatureExtractor()
        # One expected row and one index per padding mode, consumed in lockstep.
        for padding, EXPECTED_VALUES, idx_in_mel in zip(
            ["repeat", "repeatpad", None], EXPECTED_INPUT_FEATURES, MEL_BIN
        ):
            input_features = feature_extractor(input_speech, return_tensors="pt", padding=padding).input_features
            self.assertTrue(torch.allclose(input_features[0, idx_in_mel], EXPECTED_VALUES, atol=1e-4))

    # NOTE(review): the name does not start with `test`, so unittest never collects this method — confirm
    # whether it is meant to run, and re-validate the expected values and slicing before renaming it.
    def integration_test_rand_trunc(self):
        # TODO in this case we should set the seed and use a longer audio to properly see the random truncation
        # fmt: off
        EXPECTED_INPUT_FEATURES = torch.tensor(
            [
                [
                    -42.3330, -36.2735, -35.9231, -43.5947, -48.4525, -46.5227, -42.6477,
                    -47.2740, -51.4336, -50.0846, -51.8711, -50.4232, -47.4736, -54.2275,
                    -53.3947, -55.4904, -54.8750, -54.5510, -55.4156, -57.4395, -51.7385,
                    -55.9118, -57.7800, -63.2064, -67.0651, -61.4379, -56.4268, -54.8667,
                    -52.3487, -56.4418, -57.1842, -55.1005, -55.6366, -59.4395, -56.8604,
                    -56.4949, -61.6573, -61.0826, -60.3250, -63.7876, -67.4882, -60.2323,
                    -54.6886, -50.5369, -47.7656, -45.8909, -49.1273, -57.4141, -58.3201,
                    -51.9862, -51.4897, -59.2561, -60.4730, -61.2203, -69.3174, -69.7464,
                    -65.5861, -58.9921, -59.5610, -61.0584, -58.1149, -64.4045, -66.2622,
                    -64.4610
                ],
                [
                    -41.2298, -38.4211, -39.8834, -45.9950, -47.3839, -43.9849, -46.0371,
                    -52.5490, -56.6912, -51.8794, -50.1284, -49.7506, -53.9422, -63.2854,
                    -56.5754, -55.0469, -55.3181, -55.8115, -56.0058, -57.9215, -58.7597,
                    -59.1994, -59.2141, -64.4198, -73.5138, -64.4647, -59.3351, -54.5626,
                    -54.7508, -65.0230, -60.0270, -54.7644, -56.0108, -60.1531, -57.6879,
                    -56.3766, -63.3395, -65.3032, -61.5202, -63.0677, -68.4217, -60.6868,
                    -54.4619, -50.8533, -47.7200, -45.9197, -49.0961, -57.7621, -59.0750,
                    -51.9122, -51.4332, -59.4132, -60.3415, -61.6558, -70.7049, -69.7905,
                    -66.9104, -59.0324, -59.6138, -61.2023, -58.2169, -65.3837, -66.4425,
                    -64.4142
                ],
                [
                    -51.6882, -50.6852, -50.8198, -51.7428, -53.0325, -54.1619, -56.4903,
                    -59.0314, -60.7996, -60.5164, -59.9680, -60.5393, -62.5796, -65.4166,
                    -65.6149, -65.1409, -65.7226, -67.9057, -72.5089, -82.3530, -86.3189,
                    -83.4241, -79.1279, -79.3384, -82.7335, -79.8316, -80.2167, -74.3638,
                    -71.3930, -75.3849, -74.5381, -71.4504, -70.3791, -71.4547, -71.8820,
                    -67.3885, -69.5686, -71.9852, -71.0307, -73.0053, -80.8802, -72.9227,
                    -63.8526, -60.3260, -59.6012, -57.8316, -61.0603, -67.3403, -67.1709,
                    -60.4967, -60.5079, -68.3345, -67.5213, -70.6416, -79.6219, -78.2198,
                    -74.6851, -69.5718, -69.4968, -70.6882, -66.8175, -73.8558, -74.3855,
                    -72.9405
                ]
            ]
        )
        # fmt: on

        input_speech = self._load_datasamples(1)
        feature_extractor = ClapFeatureExtractor()
        # One expected row per padding mode, consumed in lockstep.
        for padding, EXPECTED_VALUES in zip(["repeat", "repeatpad", None], EXPECTED_INPUT_FEATURES):
            input_features = feature_extractor(
                input_speech, return_tensors="pt", truncation="rand_trunc", padding=padding
            ).input_features
            self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_VALUES, atol=1e-4))
|
||||
665
tests/models/clap/test_modeling_clap.py
Normal file
665
tests/models/clap/test_modeling_clap.py
Normal file
@@ -0,0 +1,665 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the PyTorch CLAP model. """
|
||||
|
||||
|
||||
import inspect
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
from datasets import load_dataset
|
||||
|
||||
from transformers import ClapAudioConfig, ClapConfig, ClapProcessor, ClapTextConfig
|
||||
from transformers.testing_utils import require_torch, slow, torch_device
|
||||
from transformers.utils import is_torch_available
|
||||
|
||||
from ...test_configuration_common import ConfigTester
|
||||
from ...test_modeling_common import (
|
||||
ModelTesterMixin,
|
||||
_config_zero_init,
|
||||
floats_tensor,
|
||||
ids_tensor,
|
||||
random_attention_mask,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from transformers import (
|
||||
ClapAudioModel,
|
||||
ClapAudioModelWithProjection,
|
||||
ClapModel,
|
||||
ClapTextModel,
|
||||
ClapTextModelWithProjection,
|
||||
)
|
||||
from transformers.models.clap.modeling_clap import CLAP_PRETRAINED_MODEL_ARCHIVE_LIST
|
||||
|
||||
|
||||
class ClapAudioModelTester:
    """Builds a small `ClapAudioConfig` plus random input features, and checks audio-model output shapes."""

    def __init__(
        self,
        parent,
        batch_size=12,
        image_size=60,
        num_mel_bins=16,
        window_size=4,
        spec_size=64,
        patch_size=2,
        patch_stride=2,
        seq_length=16,
        freq_ratio=2,
        num_channels=3,
        is_training=True,
        hidden_size=256,
        patch_embeds_hidden_size=32,
        projection_dim=32,
        num_hidden_layers=4,
        num_heads=None,
        intermediate_size=37,
        dropout=0.1,
        attention_dropout=0.1,
        initializer_range=0.02,
        scope=None,
    ):
        # A fresh list is built per instance: a list literal as the default value would be shared by every
        # tester, so mutating it on one instance would leak into the others.
        if num_heads is None:
            num_heads = [2, 2, 2, 2]

        self.parent = parent
        self.batch_size = batch_size
        self.image_size = image_size
        self.num_mel_bins = num_mel_bins
        self.window_size = window_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.is_training = is_training
        self.hidden_size = hidden_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        # Scalar head count taken from the first entry of `num_heads`.
        self.num_attention_heads = num_heads[0]
        self.seq_length = seq_length
        self.spec_size = spec_size
        self.freq_ratio = freq_ratio
        self.patch_stride = patch_stride
        self.patch_embeds_hidden_size = patch_embeds_hidden_size
        self.intermediate_size = intermediate_size
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.scope = scope

    def prepare_config_and_inputs(self):
        """Return `(config, input_features)`; the features are requested from `floats_tensor` with shape
        `[batch_size, 1, hidden_size, num_mel_bins]`."""
        input_features = floats_tensor([self.batch_size, 1, self.hidden_size, self.num_mel_bins])
        config = self.get_config()

        return config, input_features

    def get_config(self):
        """Build a `ClapAudioConfig` from the values stored on this tester."""
        return ClapAudioConfig(
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_mel_bins=self.num_mel_bins,
            window_size=self.window_size,
            num_channels=self.num_channels,
            hidden_size=self.hidden_size,
            patch_stride=self.patch_stride,
            projection_dim=self.projection_dim,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_heads,
            intermediate_size=self.intermediate_size,
            dropout=self.dropout,
            attention_dropout=self.attention_dropout,
            initializer_range=self.initializer_range,
            spec_size=self.spec_size,
            freq_ratio=self.freq_ratio,
            patch_embeds_hidden_size=self.patch_embeds_hidden_size,
        )

    def create_and_check_model(self, config, input_features):
        """Run `ClapAudioModel` in eval mode and check that `pooler_output` is `(batch_size, hidden_size)`."""
        model = ClapAudioModel(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(input_features)
        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))

    def create_and_check_model_with_projection(self, config, input_features):
        """Run `ClapAudioModelWithProjection` and check that `audio_embeds` is `(batch_size, projection_dim)`."""
        model = ClapAudioModelWithProjection(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(input_features)
        self.parent.assertEqual(result.audio_embeds.shape, (self.batch_size, self.projection_dim))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)` with the features stored under the `"input_features"` key."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, input_features = config_and_inputs
        inputs_dict = {"input_features": input_features}
        return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
class ClapAudioModelTest(ModelTesterMixin, unittest.TestCase):
    """
    Here we also overwrite some of the tests of test_modeling_common.py, as CLAP does not use input_ids, inputs_embeds,
    attention_mask and seq_length.
    """

    all_model_classes = (ClapAudioModel, ClapAudioModelWithProjection) if is_torch_available() else ()
    fx_compatible = False
    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False

    def setUp(self):
        self.model_tester = ClapAudioModelTester(self)
        self.config_tester = ConfigTester(self, config_class=ClapAudioConfig, has_text_modality=False, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(reason="ClapAudioModel does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_common_attributes(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
            # Output embeddings are optional, but must be a linear layer when present.
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, nn.Linear))

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            hidden_states = outputs.hidden_states

            # The tester may override the expected count; otherwise it is `num_hidden_layers + 1`.
            expected_num_layers = getattr(
                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
            )
            self.assertEqual(len(hidden_states), expected_num_layers)

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [self.model_tester.patch_embeds_hidden_size, self.model_tester.patch_embeds_hidden_size],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass")
    def test_retain_grad_hidden_states_attentions(self):
        pass

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.forward)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["input_features"]
            self.assertListEqual(arg_names[:1], expected_arg_names)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_model_with_projection(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model_with_projection(*config_and_inputs)

    @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass")
    def test_training(self):
        pass

    @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass")
    def test_training_gradient_checkpointing(self):
        pass

    @unittest.skip(reason="ClapAudioModel has no base class and is not available in MODEL_MAPPING")
    def test_save_load_fast_init_from_base(self):
        pass

    @unittest.skip(reason="ClapAudioModel has no base class and is not available in MODEL_MAPPING")
    def test_save_load_fast_init_to_base(self):
        pass

    @slow
    def test_model_from_pretrained(self):
        # Only the first archived checkpoint is loaded.
        for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = ClapAudioModel.from_pretrained(model_name)
            self.assertIsNotNone(model)

    @slow
    def test_model_with_projection_from_pretrained(self):
        for model_name in CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = ClapAudioModelWithProjection.from_pretrained(model_name)
            self.assertIsNotNone(model)
            # The audio model's projection head is named `audio_projection`; checking for
            # `visual_projection` (a vision-model attribute name) would always fail here.
            self.assertTrue(hasattr(model, "audio_projection"))
|
||||
|
||||
|
||||
class ClapTextModelTester:
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=12,
|
||||
seq_length=7,
|
||||
is_training=True,
|
||||
use_input_mask=True,
|
||||
use_labels=True,
|
||||
vocab_size=99,
|
||||
hidden_size=32,
|
||||
projection_dim=32,
|
||||
num_hidden_layers=5,
|
||||
num_attention_heads=4,
|
||||
intermediate_size=37,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.1,
|
||||
max_position_embeddings=512,
|
||||
initializer_range=0.02,
|
||||
scope=None,
|
||||
projection_hidden_act="relu",
|
||||
):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.seq_length = seq_length
|
||||
self.is_training = is_training
|
||||
self.use_input_mask = use_input_mask
|
||||
self.use_labels = use_labels
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.projection_dim = projection_dim
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.intermediate_size = intermediate_size
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.initializer_range = initializer_range
|
||||
self.scope = scope
|
||||
self.projection_hidden_act = projection_hidden_act
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
input_mask = None
|
||||
if self.use_input_mask:
|
||||
input_mask = random_attention_mask([self.batch_size, self.seq_length])
|
||||
|
||||
if input_mask is not None:
|
||||
batch_size, seq_length = input_mask.shape
|
||||
rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,))
|
||||
for batch_idx, start_index in enumerate(rnd_start_indices):
|
||||
input_mask[batch_idx, :start_index] = 1
|
||||
input_mask[batch_idx, start_index:] = 0
|
||||
|
||||
config = self.get_config()
|
||||
|
||||
return config, input_ids, input_mask
|
||||
|
||||
def get_config(self):
|
||||
return ClapTextConfig(
|
||||
vocab_size=self.vocab_size,
|
||||
hidden_size=self.hidden_size,
|
||||
projection_dim=self.projection_dim,
|
||||
num_hidden_layers=self.num_hidden_layers,
|
||||
num_attention_heads=self.num_attention_heads,
|
||||
intermediate_size=self.intermediate_size,
|
||||
dropout=self.dropout,
|
||||
attention_dropout=self.attention_dropout,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
initializer_range=self.initializer_range,
|
||||
projection_hidden_act=self.projection_hidden_act,
|
||||
)
|
||||
|
||||
def create_and_check_model(self, config, input_ids, input_mask):
|
||||
model = ClapTextModel(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
result = model(input_ids, attention_mask=input_mask)
|
||||
result = model(input_ids)
|
||||
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
|
||||
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
|
||||
|
||||
def create_and_check_model_with_projection(self, config, input_ids, input_mask):
|
||||
model = ClapTextModelWithProjection(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
result = model(input_ids, attention_mask=input_mask)
|
||||
result = model(input_ids)
|
||||
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
|
||||
self.parent.assertEqual(result.text_embeds.shape, (self.batch_size, self.projection_dim))
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
    """Return the config plus a keyword dict of model inputs.

    Returns:
        A `(config, inputs_dict)` pair where `inputs_dict` maps `"input_ids"` and
        `"attention_mask"` to the values produced by `prepare_config_and_inputs`.
    """
    config, token_ids, mask = self.prepare_config_and_inputs()
    return config, {"input_ids": token_ids, "attention_mask": mask}
|
||||
|
||||
|
||||
@require_torch
class ClapTextModelTest(ModelTesterMixin, unittest.TestCase):
    """Tests for the CLAP text models.

    Inherits the shared `ModelTesterMixin` suite for `ClapTextModel` and
    `ClapTextModelWithProjection`, adds shape checks driven by
    `ClapTextModelTester`, and skips the inherited tests listed below.
    """

    all_model_classes = (ClapTextModel, ClapTextModelWithProjection) if is_torch_available() else ()
    # Feature flags (presumably read by `ModelTesterMixin`); all disabled here.
    fx_compatible = False
    test_pruning = False
    test_head_masking = False

    def setUp(self):
        # Fresh helpers for every test; the config tester is given hidden_size=37.
        self.config_tester = ConfigTester(self, config_class=ClapTextConfig, hidden_size=37)
        self.model_tester = ClapTextModelTester(self)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config, input_ids, input_mask = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(config, input_ids, input_mask)

    def test_model_with_projection(self):
        config, input_ids, input_mask = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model_with_projection(config, input_ids, input_mask)

    # --- Inherited tests that are skipped for the text models -----------------

    @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass")
    def test_training(self):
        pass

    @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass")
    def test_training_gradient_checkpointing(self):
        pass

    @unittest.skip(reason="ClapTextModel does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    @unittest.skip(reason="ClapTextModel has no base class and is not available in MODEL_MAPPING")
    def test_save_load_fast_init_from_base(self):
        pass

    @unittest.skip(reason="ClapTextModel has no base class and is not available in MODEL_MAPPING")
    def test_save_load_fast_init_to_base(self):
        pass

    # --- Slow tests loading pretrained checkpoints ----------------------------

    @slow
    def test_model_from_pretrained(self):
        # Only the first entry of the archive list is loaded.
        first_checkpoints = CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]
        for checkpoint in first_checkpoints:
            loaded_model = ClapTextModel.from_pretrained(checkpoint)
            self.assertIsNotNone(loaded_model)

    @slow
    def test_model_with_projection_from_pretrained(self):
        # Only the first entry of the archive list is loaded.
        first_checkpoints = CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]
        for checkpoint in first_checkpoints:
            loaded_model = ClapTextModelWithProjection.from_pretrained(checkpoint)
            self.assertIsNotNone(loaded_model)
            # The projection variant must expose its projection head.
            self.assertTrue(hasattr(loaded_model, "text_projection"))
|
||||
|
||||
|
||||
class ClapModelTester:
    """Test helper that combines a text tester and an audio tester to drive `ClapModel`."""

    def __init__(self, parent, text_kwargs=None, audio_kwargs=None, is_training=True):
        """Create the two sub-testers.

        Args:
            parent: Test case whose assertion methods are used for the checks.
            text_kwargs: Optional keyword arguments forwarded to `ClapTextModelTester`.
            audio_kwargs: Optional keyword arguments forwarded to `ClapAudioModelTester`.
            is_training: Stored unchanged on the instance as `is_training`.
        """
        # `None` means "no overrides"; a fresh dict avoids a shared mutable default.
        text_overrides = {} if text_kwargs is None else text_kwargs
        audio_overrides = {} if audio_kwargs is None else audio_kwargs

        self.parent = parent
        self.text_model_tester = ClapTextModelTester(parent, **text_overrides)
        self.audio_model_tester = ClapAudioModelTester(parent, **audio_overrides)
        self.is_training = is_training

    def prepare_config_and_inputs(self):
        """Return `(config, input_ids, attention_mask, input_features)` for `ClapModel`."""
        # The sub-testers' own configs are discarded; the combined config from
        # `get_config` is returned instead.
        _text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
        _audio_config, input_features = self.audio_model_tester.prepare_config_and_inputs()

        return self.get_config(), input_ids, attention_mask, input_features

    def get_config(self):
        """Build a `ClapConfig` from both sub-testers' configs with `projection_dim=64`."""
        text_config = self.text_model_tester.get_config()
        audio_config = self.audio_model_tester.get_config()
        return ClapConfig.from_text_audio_configs(text_config, audio_config, projection_dim=64)

    def create_and_check_model(self, config, input_ids, attention_mask, input_features):
        """Run `ClapModel` in eval mode and check the shapes of both logit matrices."""
        clap_model = ClapModel(config)
        clap_model = clap_model.to(torch_device).eval()
        with torch.no_grad():
            outputs = clap_model(input_ids, input_features, attention_mask)

        text_batch = self.text_model_tester.batch_size
        audio_batch = self.audio_model_tester.batch_size
        # The two logit matrices are expected to have transposed shapes.
        self.parent.assertEqual(outputs.logits_per_audio.shape, (audio_batch, text_batch))
        self.parent.assertEqual(outputs.logits_per_text.shape, (text_batch, audio_batch))

    def prepare_config_and_inputs_for_common(self):
        """Return `(config, inputs_dict)`; the dict also sets `return_loss=True`."""
        config, input_ids, attention_mask, input_features = self.prepare_config_and_inputs()
        inputs_dict = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            input_features=input_features,
            return_loss=True,
        )
        return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
class ClapModelTest(ModelTesterMixin, unittest.TestCase):
    """Tests for the combined text/audio `ClapModel`.

    Inherits the shared `ModelTesterMixin` suite, skips the tests listed below and
    overrides the initialization and torchscript checks for CLAP.
    """

    all_model_classes = (ClapModel,) if is_torch_available() else ()
    # Feature flags (presumably read by `ModelTesterMixin`); all disabled here.
    fx_compatible = False
    test_head_masking = False
    test_pruning = False
    test_resize_embeddings = False
    test_attention_outputs = False

    def setUp(self):
        self.model_tester = ClapModelTester(self)

    def test_model(self):
        config, input_ids, attention_mask, input_features = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(config, input_ids, attention_mask, input_features)

    # --- Inherited tests that are skipped for the combined model --------------

    @unittest.skip(reason="Hidden_states is tested in individual model tests")
    def test_hidden_states_output(self):
        pass

    @unittest.skip(reason="Inputs_embeds is tested in individual model tests")
    def test_inputs_embeds(self):
        pass

    @unittest.skip(reason="Retain_grad is tested in individual model tests")
    def test_retain_grad_hidden_states_attentions(self):
        pass

    @unittest.skip(reason="ClapModel does not have input/output embeddings")
    def test_model_common_attributes(self):
        pass

    # Overridden because the `logit_scale` parameter initialization is different for CLAP.
    def test_initialization(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
        configs_no_init = _config_zero_init(config)

        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                if not param.requires_grad:
                    continue

                failure_msg = f"Parameter {name} of model {model_class} seems not properly initialized"
                if name == "logit_scale":
                    # `logit_scale` must be initialized as in the original
                    # implementation, i.e. to log(1 / 0.07).
                    self.assertAlmostEqual(param.data.item(), np.log(1 / 0.07), delta=1e-3, msg=failure_msg)
                    continue

                # Every other trainable parameter must have a mean of 0.0 or 1.0
                # once rounded to nine decimals.
                rounded_mean = ((param.data.mean() * 1e9).round() / 1e9).item()
                self.assertIn(rounded_mean, [0.0, 1.0], msg=failure_msg)

    def _create_and_check_torchscript(self, config, inputs_dict):
        # Traces each model class, saves and reloads the traced module, then checks
        # that the reloaded state dict matches the original one.
        if not self.test_torchscript:
            return

        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
        configs_no_init.torchscript = True
        configs_no_init.return_dict = False

        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            model.to(torch_device)
            model.eval()

            try:
                # CLAP needs `input_features` next to `input_ids` for tracing.
                trace_inputs = (inputs_dict["input_ids"], inputs_dict["input_features"])
                traced_model = torch.jit.trace(model, trace_inputs)
            except RuntimeError:
                self.fail("Couldn't trace module.")

            with tempfile.TemporaryDirectory() as tmp_dir_name:
                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")

                try:
                    torch.jit.save(traced_model, pt_file_name)
                except Exception:
                    self.fail("Couldn't save module.")

                try:
                    loaded_model = torch.jit.load(pt_file_name)
                except Exception:
                    self.fail("Couldn't load module.")

            model.to(torch_device)
            model.eval()

            loaded_model.to(torch_device)
            loaded_model.eval()

            original_state = model.state_dict()
            reloaded_state = loaded_model.state_dict()

            self.assertEqual(set(original_state.keys()), set(reloaded_state.keys()))

            # The models are equal when no entry differs in any element.
            models_equal = not any(
                weights.data.ne(reloaded_state[layer_name].data).sum() > 0
                for layer_name, weights in original_state.items()
            )
            self.assertTrue(models_equal)

    def test_load_audio_text_config(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        # Save the full ClapConfig and check that each sub-config class can be loaded
        # from it (audio first, then text) and equals the matching nested config.
        sub_configs = ((ClapAudioConfig, "audio_config"), (ClapTextConfig, "text_config"))
        for sub_config_class, attribute in sub_configs:
            with tempfile.TemporaryDirectory() as tmp_dir_name:
                config.save_pretrained(tmp_dir_name)
                loaded_config = sub_config_class.from_pretrained(tmp_dir_name)
            self.assertDictEqual(getattr(config, attribute).to_dict(), loaded_config.to_dict())

    @slow
    def test_model_from_pretrained(self):
        # Only the first entry of the archive list is loaded.
        first_checkpoints = CLAP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]
        for checkpoint in first_checkpoints:
            loaded_model = ClapModel.from_pretrained(checkpoint)
            self.assertIsNotNone(loaded_model)
|
||||
|
||||
|
||||
@slow
@require_torch
class ClapModelIntegrationTest(unittest.TestCase):
    """Slow integration tests comparing pretrained CLAP audio embeddings with reference means.

    For each padding mode in `paddings`, the last sample of the dummy LibriSpeech
    validation split is run through the processor and `get_audio_features`, and
    the mean of the resulting embedding is compared with a stored reference.
    """

    paddings = ["repeatpad", "repeat", "pad"]

    def _check_audio_embed_means(self, model_id, expected_means, **processor_kwargs):
        """Check the mean audio embedding of `model_id` for every padding mode.

        Args:
            model_id: Checkpoint from which both `ClapModel` and `ClapProcessor` are loaded.
            expected_means: Mapping from padding mode to the reference embedding mean.
            **processor_kwargs: Extra keyword arguments forwarded to the processor call.
        """
        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        audio_sample = librispeech_dummy[-1]

        model = ClapModel.from_pretrained(model_id).to(torch_device)
        processor = ClapProcessor.from_pretrained(model_id)

        for padding in self.paddings:
            # One sub-test per padding mode: a failure names the mode and the
            # remaining modes are still exercised.
            with self.subTest(padding=padding):
                inputs = processor(
                    audios=audio_sample["audio"]["array"], return_tensors="pt", padding=padding, **processor_kwargs
                ).to(torch_device)

                # Pure inference: no autograd graph is needed.
                with torch.no_grad():
                    audio_embed = model.get_audio_features(**inputs)

                actual_mean = audio_embed.cpu().mean()
                expected_mean = expected_means[padding]

                # NOTE(review): atol=1e-3 is of the same order as the reference
                # means themselves, so this check is loose - confirm the tolerance.
                self.assertTrue(
                    torch.allclose(actual_mean, torch.tensor([expected_mean]), atol=1e-3, rtol=1e-3),
                    msg=f"padding={padding!r}: mean {actual_mean.item():.6f} differs from expected {expected_mean}",
                )

    def test_integration_unfused(self):
        EXPECTED_MEANS_UNFUSED = {
            "repeatpad": 0.0024,
            "pad": 0.0020,
            "repeat": 0.0023,
        }

        self._check_audio_embed_means("laion/clap-htsat-unfused", EXPECTED_MEANS_UNFUSED)

    def test_integration_fused(self):
        EXPECTED_MEANS_FUSED = {
            "repeatpad": 0.00069,
            "repeat": 0.00196,
            "pad": -0.000379,
        }

        # The fused checkpoint is exercised with the "fusion" truncation mode.
        self._check_audio_embed_means("laion/clap-htsat-fused", EXPECTED_MEANS_FUSED, truncation="fusion")
|
||||
# --- New file: tests/models/clap/test_processor_clap.py (normal file, 125 lines added; @@ -0,0 +1,125 @@) ---
|
||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import ClapFeatureExtractor, ClapProcessor, RobertaTokenizer, RobertaTokenizerFast
|
||||
from transformers.testing_utils import require_sentencepiece, require_torchaudio
|
||||
|
||||
from .test_feature_extraction_clap import floats_list
|
||||
|
||||
|
||||
@require_torchaudio
@require_sentencepiece
class ClapProcessorTest(unittest.TestCase):
    """Tests that `ClapProcessor` survives save/load and agrees with its tokenizer and feature extractor."""

    def setUp(self):
        self.checkpoint = "laion/clap-htsat-unfused"
        # Scratch directory for save/load round trips; removed again in `tearDown`.
        self.tmpdirname = tempfile.mkdtemp()

    def get_tokenizer(self, **kwargs):
        """Load a `RobertaTokenizer` from the test checkpoint, forwarding `kwargs`."""
        return RobertaTokenizer.from_pretrained(self.checkpoint, **kwargs)

    def get_feature_extractor(self, **kwargs):
        """Load a `ClapFeatureExtractor` from the test checkpoint, forwarding `kwargs`."""
        return ClapFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def _new_processor(self):
        """Return `(processor, tokenizer, feature_extractor)` built from the test checkpoint."""
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()
        processor = ClapProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
        return processor, tokenizer, feature_extractor

    def test_save_load_pretrained_default(self):
        original, tokenizer, feature_extractor = self._new_processor()

        original.save_pretrained(self.tmpdirname)
        reloaded = ClapProcessor.from_pretrained(self.tmpdirname)

        self.assertEqual(reloaded.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertIsInstance(reloaded.tokenizer, RobertaTokenizerFast)

        self.assertEqual(reloaded.feature_extractor.to_json_string(), feature_extractor.to_json_string())
        self.assertIsInstance(reloaded.feature_extractor, ClapFeatureExtractor)

    def test_save_load_pretrained_additional_features(self):
        saved_processor, _, _ = self._new_processor()
        saved_processor.save_pretrained(self.tmpdirname)

        # Overrides applied both to the reference components and to the reloaded processor.
        tokenizer_overrides = {"bos_token": "(BOS)", "eos_token": "(EOS)"}
        extractor_overrides = {"do_normalize": False, "padding_value": 1.0}

        tokenizer_add_kwargs = self.get_tokenizer(**tokenizer_overrides)
        feature_extractor_add_kwargs = self.get_feature_extractor(**extractor_overrides)

        reloaded = ClapProcessor.from_pretrained(self.tmpdirname, **tokenizer_overrides, **extractor_overrides)

        self.assertEqual(reloaded.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(reloaded.tokenizer, RobertaTokenizerFast)

        self.assertEqual(reloaded.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(reloaded.feature_extractor, ClapFeatureExtractor)

    def test_feature_extractor(self):
        processor, _, feature_extractor = self._new_processor()

        raw_speech = floats_list((3, 1000))

        from_extractor = feature_extractor(raw_speech, return_tensors="np")
        from_processor = processor(audios=raw_speech, return_tensors="np")

        # The processor output is compared with the bare extractor output by summed value per key.
        for key in from_extractor.keys():
            self.assertAlmostEqual(from_extractor[key].sum(), from_processor[key].sum(), delta=1e-2)

    def test_tokenizer(self):
        processor, tokenizer, _ = self._new_processor()

        input_str = "This is a test string"

        from_processor = processor(text=input_str)
        from_tokenizer = tokenizer(input_str)

        for key in from_tokenizer.keys():
            self.assertListEqual(from_tokenizer[key], from_processor[key])

    def test_tokenizer_decode(self):
        processor, tokenizer, _ = self._new_processor()

        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]

        decoded_by_processor = processor.batch_decode(predicted_ids)
        decoded_by_tokenizer = tokenizer.batch_decode(predicted_ids)

        self.assertListEqual(decoded_by_tokenizer, decoded_by_processor)

    def test_model_input_names(self):
        processor, _, feature_extractor = self._new_processor()

        # The first two processor input names are skipped before comparing
        # (presumably the tokenizer's names - verify against `ClapProcessor`).
        audio_input_names = processor.model_input_names[2:]
        self.assertListEqual(
            audio_input_names,
            feature_extractor.model_input_names,
            msg="`processor` and `feature_extractor` model input names do not match",
        )
|
||||
Reference in New Issue
Block a user