Fix bug when requesting input normalization with EnCodec (#34756)
* EnCodec: unsqueeze padding mask
* add test for normalization
This commit is contained in:
committed by GitHub
parent 96bf3d6cc5
commit f408d55448
@@ -576,7 +576,7 @@ class EncodecModel(EncodecPreTrainedModel):
|
|||||||
scale = None
|
scale = None
|
||||||
if self.config.normalize:
|
if self.config.normalize:
|
||||||
# if the padding is non zero
|
# if the padding is non zero
|
||||||
input_values = input_values * padding_mask
|
input_values = input_values * padding_mask.unsqueeze(1)
|
||||||
mono = torch.sum(input_values, 1, keepdim=True) / input_values.shape[1]
|
mono = torch.sum(input_values, 1, keepdim=True) / input_values.shape[1]
|
||||||
scale = mono.pow(2).mean(dim=-1, keepdim=True).sqrt() + 1e-8
|
scale = mono.pow(2).mean(dim=-1, keepdim=True).sqrt() + 1e-8
|
||||||
input_values = input_values / scale
|
input_values = input_values / scale
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin
|
|||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from transformers import EncodecModel
|
from transformers import EncodecFeatureExtractor, EncodecModel
|
||||||
|
|
||||||
|
|
||||||
def prepare_inputs_dict(
|
def prepare_inputs_dict(
|
||||||
@@ -111,6 +111,19 @@ class EncodecModelTester:
|
|||||||
|
|
||||||
return config, inputs_dict
|
return config, inputs_dict
|
||||||
|
|
||||||
|
def prepare_config_and_inputs_for_normalization(self):
|
||||||
|
input_values = floats_tensor([self.batch_size, self.num_channels, self.intermediate_size], scale=1.0)
|
||||||
|
config = self.get_config()
|
||||||
|
config.normalize = True
|
||||||
|
|
||||||
|
processor = EncodecFeatureExtractor(feature_size=config.audio_channels, sampling_rate=config.sampling_rate)
|
||||||
|
input_values = list(input_values.cpu().numpy())
|
||||||
|
inputs_dict = processor(
|
||||||
|
input_values, sampling_rate=config.sampling_rate, padding=True, return_tensors="pt"
|
||||||
|
).to(torch_device)
|
||||||
|
|
||||||
|
return config, inputs_dict
|
||||||
|
|
||||||
def get_config(self):
|
def get_config(self):
|
||||||
return EncodecConfig(
|
return EncodecConfig(
|
||||||
audio_channels=self.num_channels,
|
audio_channels=self.num_channels,
|
||||||
@@ -125,9 +138,7 @@ class EncodecModelTester:
|
|||||||
|
|
||||||
def create_and_check_model_forward(self, config, inputs_dict):
|
def create_and_check_model_forward(self, config, inputs_dict):
|
||||||
model = EncodecModel(config=config).to(torch_device).eval()
|
model = EncodecModel(config=config).to(torch_device).eval()
|
||||||
|
result = model(**inputs_dict)
|
||||||
input_values = inputs_dict["input_values"]
|
|
||||||
result = model(input_values)
|
|
||||||
self.parent.assertEqual(
|
self.parent.assertEqual(
|
||||||
result.audio_values.shape, (self.batch_size, self.num_channels, self.intermediate_size)
|
result.audio_values.shape, (self.batch_size, self.num_channels, self.intermediate_size)
|
||||||
)
|
)
|
||||||
@@ -435,6 +446,10 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
|||||||
config.use_conv_shortcut = False
|
config.use_conv_shortcut = False
|
||||||
self.model_tester.create_and_check_model_forward(config, inputs_dict)
|
self.model_tester.create_and_check_model_forward(config, inputs_dict)
|
||||||
|
|
||||||
|
def test_model_forward_with_normalization(self):
|
||||||
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_normalization()
|
||||||
|
self.model_tester.create_and_check_model_forward(config, inputs_dict)
|
||||||
|
|
||||||
|
|
||||||
def normalize(arr):
|
def normalize(arr):
|
||||||
norm = np.linalg.norm(arr)
|
norm = np.linalg.norm(arr)
|
||||||
|
|||||||
Reference in New Issue
Block a user