Add MusicGen Melody (#28819)

* first modeling code

* make repository

* still WIP

* update model

* add tests

* add latest change

* clean docstrings and copied from

* update docstrings md and readme

* correct chroma function

* correct copied from and remove unrelated test

* add doc to toctree

* correct imports

* add convert script to notdoctested

* Add suggestion from Sanchit

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* correct get_unconditional_inputs docstrings

* modify README according to SANCHIT feedback

* add chroma to audio utils

* clean librosa and torchaudio hard dependencies

* fix FE

* refactor audio decoder -> audio encoder for consistency with previous musicgen

* refactor conditional -> encoder

* modify sampling rate logics

* modify license at the beginning

* refactor all_self_attns->all_attentions

* remove ignore copy from causallm generate

* add copied from for from_sub_models

* fix make copies

* add warning if audio is truncated

* add copied from where relevant

* remove artefact

* fix convert script

* fix torchaudio and FE

* modify chroma method according to feedback-> better naming

* refactor input_values->input_features

* refactor input_values->input_features and fix import fe

* add input_features to docstrings

* correct inputs_embeds logics

* remove dtype conversion

* refactor _prepare_conditional_hidden_states_kwargs_for_generation ->_prepare_encoder_hidden_states_kwargs_for_generation

* change warning for chroma length

* Update src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>

* change way to save wav, using soundfile

* correct docs and change to soundfile

* fix import

* fix init proj layers

* remove line breaks from md

* fix issue with docstrings

* add FE suggestions

* improve is in logics and remove useless imports

* remove custom from_pretrained

* simplify docstring code

* add suggestions for modeling tests

* make style

* update converting script with sanity check

* remove encoder attention mask from conditional generation

* replace musicgen melody checkpoints with official orga

* rename ylacombe->facebook in checkpoints

* fix copies

* remove unnecessary warning

* add shape in code docstrings

* add files to slow doc tests

* fix md bug and add md to not_tested

* make fix-copies

* fix hidden states test and batching

---------

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
This commit is contained in:
Yoach Lacombe
2024-03-18 13:06:12 +00:00
committed by GitHub
parent bf3dfd1160
commit c43b380e70
40 changed files with 5947 additions and 3 deletions

View File

@@ -20,6 +20,7 @@ import pytest
from transformers.audio_utils import (
amplitude_to_db,
chroma_filter_bank,
hertz_to_mel,
mel_filter_bank,
mel_to_hertz,
@@ -27,6 +28,11 @@ from transformers.audio_utils import (
spectrogram,
window_function,
)
from transformers.testing_utils import is_librosa_available, require_librosa
if is_librosa_available():
from librosa.filters import chroma
class AudioUtilsFunctionTester(unittest.TestCase):
@@ -755,3 +761,57 @@ class AudioUtilsFunctionTester(unittest.TestCase):
amplitude_to_db(spectrogram, min_value=0.0)
with pytest.raises(ValueError):
amplitude_to_db(spectrogram, db_range=-80)
@require_librosa
def test_chroma_equivalence(self):
    """Compare `chroma_filter_bank` against `librosa.filters.chroma`.

    Each case pairs the extra keyword arguments given to the librosa
    reference with the extra keyword arguments given to the utility under
    test; both calls also receive the same sampling rate, chroma count and
    FFT size. The two resulting arrays must agree under `np.allclose`.
    """
    # Arguments common to every case, spelled with each library's own names.
    reference_base = {"sr": 24000, "n_chroma": 6, "n_fft": 25}
    utils_base = {"num_frequency_bins": 25, "num_chroma": 6, "sampling_rate": 24000}

    # (librosa overrides, chroma_filter_bank overrides)
    cases = [
        # default parameters
        ({}, {}),
        # no weighting parameters
        ({"octwidth": None}, {"weighting_parameters": None}),
        # L1 norm
        ({"norm": 1.0}, {"power": 1.0}),
        # start at the 'A' chroma, no normalisation, different weighting parameters
        (
            {"norm": None, "base_c": None, "octwidth": 1.0, "ctroct": 4.0},
            {"power": None, "start_at_c_chroma": False, "weighting_parameters": (4.0, 1.0)},
        ),
    ]

    for reference_overrides, utils_overrides in cases:
        expected = chroma(**reference_base, **reference_overrides)
        actual = chroma_filter_bank(**utils_base, **utils_overrides)
        self.assertTrue(np.allclose(expected, actual))