diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py index 9339c26453..6fe658a705 100644 --- a/src/transformers/models/encodec/modeling_encodec.py +++ b/src/transformers/models/encodec/modeling_encodec.py @@ -576,7 +576,7 @@ class EncodecModel(EncodecPreTrainedModel): scale = None if self.config.normalize: # if the padding is non zero - input_values = input_values * padding_mask + input_values = input_values * padding_mask.unsqueeze(1) mono = torch.sum(input_values, 1, keepdim=True) / input_values.shape[1] scale = mono.pow(2).mean(dim=-1, keepdim=True).sqrt() + 1e-8 input_values = input_values / scale diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index 2aac4dba82..47931cbe45 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -39,7 +39,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): import torch - from transformers import EncodecModel + from transformers import EncodecFeatureExtractor, EncodecModel def prepare_inputs_dict( @@ -111,6 +111,19 @@ class EncodecModelTester: return config, inputs_dict + def prepare_config_and_inputs_for_normalization(self): + input_values = floats_tensor([self.batch_size, self.num_channels, self.intermediate_size], scale=1.0) + config = self.get_config() + config.normalize = True + + processor = EncodecFeatureExtractor(feature_size=config.audio_channels, sampling_rate=config.sampling_rate) + input_values = list(input_values.cpu().numpy()) + inputs_dict = processor( + input_values, sampling_rate=config.sampling_rate, padding=True, return_tensors="pt" + ).to(torch_device) + + return config, inputs_dict + def get_config(self): return EncodecConfig( audio_channels=self.num_channels, @@ -125,9 +138,7 @@ class EncodecModelTester: def create_and_check_model_forward(self, config, inputs_dict): model = EncodecModel(config=config).to(torch_device).eval() - - input_values = inputs_dict["input_values"] - result = model(input_values) + result = model(**inputs_dict) self.parent.assertEqual( result.audio_values.shape, (self.batch_size, self.num_channels, self.intermediate_size) ) @@ -435,6 +446,10 @@ class EncodecModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) config.use_conv_shortcut = False self.model_tester.create_and_check_model_forward(config, inputs_dict) + def test_model_forward_with_normalization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_normalization() + self.model_tester.create_and_check_model_forward(config, inputs_dict) + def normalize(arr): norm = np.linalg.norm(arr)