From 232822f36d49598e68e152a9ca0a6d90be6f54b5 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov
Date: Wed, 27 Oct 2021 20:17:31 +0300
Subject: [PATCH] Add DistilHuBERT (#14174)

* Add conversion

* Rename

* Add an integration test and remove layer_norm

* Remove layer_norm from the converter

* wording

* Fix imports
---
Review notes (this section is commentary, not part of the change):

 * The line structure of the patch was restored; hunk sizes are unchanged.
 * load_conv_layer: the layer-norm assertion messages indexed
   `feature_extractor[layer_id]` while the checked attribute is
   `feature_extractor.conv_layers[layer_id]`; formatting the message on a
   shape mismatch would have failed instead of reporting the mismatch.
 * set_recursively: the conditional inside the f-strings is now
   parenthesized so the key is still printed when `weight_type` is None.
 * load_conv_layer: the layer-norm bias branch logged "weight".
 * convert_config: `eval()` on a checkpoint-provided string is flagged.
 * test_inference_distilhubert: the comment referred to SEW.
 * Not changed (would alter the hunk size): `mapped_key = mapped_key` and
   the trailing `continue` in recursively_load_weights are no-ops.
 * The blob ids on the `index` lines are those of the original commit.

 .../models/hubert/configuration_hubert.py     |   4 +
 ...rt_original_s3prl_checkpoint_to_pytorch.py | 218 ++++++++++++++++++
 .../models/hubert/modeling_hubert.py          |   7 +-
 tests/test_modeling_hubert.py                 |  43 ++++
 4 files changed, 270 insertions(+), 2 deletions(-)
 create mode 100644 src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py

diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py
index 30f47b5344..d663c458a9 100644
--- a/src/transformers/models/hubert/configuration_hubert.py
+++ b/src/transformers/models/hubert/configuration_hubert.py
@@ -70,6 +70,8 @@ class HubertConfig(PretrainedConfig):
             convolutional layers.
         feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
             The dropout probability for output of the feature extractor.
+        feat_proj_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to apply LayerNorm to the output of the feature extractor.
         feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
             extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
@@ -149,6 +151,7 @@ class HubertConfig(PretrainedConfig):
         hidden_dropout=0.1,
         activation_dropout=0.1,
         attention_dropout=0.1,
+        feat_proj_layer_norm=True,
         feat_proj_dropout=0.0,
         final_dropout=0.1,
         layerdrop=0.1,
@@ -195,6 +198,7 @@ class HubertConfig(PretrainedConfig):
         self.hidden_dropout = hidden_dropout
         self.attention_dropout = attention_dropout
         self.activation_dropout = activation_dropout
+        self.feat_proj_layer_norm = feat_proj_layer_norm
         self.feat_proj_dropout = feat_proj_dropout
         self.final_dropout = final_dropout
         self.layerdrop = layerdrop
diff --git a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000..c1963faa73
--- /dev/null
+++ b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py
@@ -0,0 +1,218 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Hubert checkpoint."""
+
+
+import argparse
+
+import torch
+
+from s3prl.hub import distilhubert
+from transformers import HubertConfig, HubertModel, Wav2Vec2FeatureExtractor, logging
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+MAPPING = {
+    "post_extract_proj": "feature_projection.projection",
+    "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
+    "self_attn.k_proj": "encoder.layers.*.attention.k_proj",
+    "self_attn.v_proj": "encoder.layers.*.attention.v_proj",
+    "self_attn.q_proj": "encoder.layers.*.attention.q_proj",
+    "self_attn.out_proj": "encoder.layers.*.attention.out_proj",
+    "self_attn_layer_norm": "encoder.layers.*.layer_norm",
+    "fc1": "encoder.layers.*.feed_forward.intermediate_dense",
+    "fc2": "encoder.layers.*.feed_forward.output_dense",
+    "final_layer_norm": "encoder.layers.*.final_layer_norm",
+    "encoder.layer_norm": "encoder.layer_norm",
+    "mask_emb": "masked_spec_embed",
+}
+
+
+def set_recursively(hf_pointer, key, value, full_name, weight_type):
+    for attribute in key.split("."):
+        hf_pointer = getattr(hf_pointer, attribute)
+
+    if weight_type is not None:
+        hf_shape = getattr(hf_pointer, weight_type).shape
+    else:
+        hf_shape = hf_pointer.shape
+
+    assert (
+        hf_shape == value.shape
+    ), f"Shape of hf {key + ('.' + weight_type if weight_type is not None else '')} is {hf_shape}, but should be {value.shape} for {full_name}"
+
+    if weight_type == "weight":
+        hf_pointer.weight.data = value
+    elif weight_type == "weight_g":
+        hf_pointer.weight_g.data = value
+    elif weight_type == "weight_v":
+        hf_pointer.weight_v.data = value
+    elif weight_type == "bias":
+        hf_pointer.bias.data = value
+    else:
+        hf_pointer.data = value
+
+    logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.")
+
+
+def recursively_load_weights(fairseq_model, hf_model):
+    unused_weights = []
+    fairseq_dict = fairseq_model.state_dict()
+
+    feature_extractor = hf_model.feature_extractor
+
+    for name, value in fairseq_dict.items():
+        is_used = False
+        if "conv_layers" in name:
+            load_conv_layer(
+                name,
+                value,
+                feature_extractor,
+                unused_weights,
+                hf_model.config.feat_extract_norm == "group",
+            )
+            is_used = True
+        else:
+            for key, mapped_key in MAPPING.items():
+                mapped_key = mapped_key
+
+                if key in name:
+                    is_used = True
+                    if "*" in mapped_key:
+                        layer_index = name.split(key)[0].split(".")[-2]
+                        mapped_key = mapped_key.replace("*", layer_index)
+                    if "weight_g" in name:
+                        weight_type = "weight_g"
+                    elif "weight_v" in name:
+                        weight_type = "weight_v"
+                    elif "weight" in name:
+                        weight_type = "weight"
+                    elif "bias" in name:
+                        weight_type = "bias"
+                    else:
+                        weight_type = None
+                    set_recursively(hf_model, mapped_key, value, name, weight_type)
+                continue
+        if not is_used:
+            unused_weights.append(name)
+
+    logger.warning(f"Unused weights: {unused_weights}")
+
+
+def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
+    name = full_name.split("conv_layers.")[-1]
+    items = name.split(".")
+    layer_id = int(items[0])
+    type_id = int(items[1])
+
+    if type_id == 0:
+        if "bias" in name:
+            assert (
+                value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape
+            ), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
+            feature_extractor.conv_layers[layer_id].conv.bias.data = value
+            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
+        elif "weight" in name:
+            assert (
+                value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape
+            ), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
+            feature_extractor.conv_layers[layer_id].conv.weight.data = value
+            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
+    elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
+        if "bias" in name:
+            assert (
+                value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape
+            ), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found."
+            feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
+            logger.info(f"Feat extract layer norm bias of layer {layer_id} was initialized from {full_name}.")
+        elif "weight" in name:
+            assert (
+                value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape
+            ), f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
+            feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
+            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
+    else:
+        unused_weights.append(full_name)
+
+
+def convert_config(model):
+    config = HubertConfig()
+    fs_config = model.config
+
+    config.activation_dropout = fs_config.activation_dropout
+    config.apply_spec_augment = False
+    config.attention_dropout = fs_config.attention_dropout
+    config.conv_bias = False
+    conv_layers = eval(fs_config.extractor_conv_feature_layers)  # NOTE(review): eval() of a checkpoint-provided string; trusted checkpoints only
+    config.conv_dim = [x[0] for x in conv_layers]
+    config.conv_kernel = [x[1] for x in conv_layers]
+    config.conv_stride = [x[2] for x in conv_layers]
+    config.feat_extract_activation = "gelu"
+    config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group"
+    config.feat_proj_layer_norm = False
+    config.feat_proj_dropout = 0.0
+    config.final_dropout = 0.0
+    config.hidden_act = fs_config.activation_fn
+    config.hidden_dropout = fs_config.dropout
+    config.hidden_size = fs_config.encoder_embed_dim
+    config.initializer_range = 0.02
+    config.intermediate_size = fs_config.encoder_ffn_embed_dim
+    config.layer_norm_eps = 1e-5
+    config.layerdrop = 0.0
+    config.num_attention_heads = fs_config.encoder_attention_heads
+    config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups
+    config.num_conv_pos_embeddings = fs_config.conv_pos
+    config.num_feat_extract_layers = len(conv_layers)
+    config.num_hidden_layers = fs_config.encoder_layers
+
+    return config
+
+
+@torch.no_grad()
+def convert_hubert_checkpoint(pytorch_dump_folder_path, config_path=None):
+    """
+    Copy/paste/tweak model's weights to transformers design.
+    """
+    model = distilhubert().model.model
+
+    if config_path is not None:
+        config = HubertConfig.from_pretrained(config_path)
+    else:
+        config = convert_config(model)
+    model = model.eval()
+
+    feature_extractor = Wav2Vec2FeatureExtractor(
+        feature_size=1,
+        sampling_rate=16000,
+        padding_value=0,
+        do_normalize=False,
+        return_attention_mask=False,
+    )
+    hf_model = HubertModel(config)
+
+    recursively_load_weights(model, hf_model)
+
+    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+    hf_model.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
+    args = parser.parse_args()
+    convert_hubert_checkpoint(args.pytorch_dump_folder_path, args.config_path)
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index 8d7a46c90d..bd1da72d68 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -325,13 +325,16 @@ class HubertFeatureExtractor(nn.Module):
 class HubertFeatureProjection(nn.Module):
     def __init__(self, config):
         super().__init__()
-        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
+        self.feat_proj_layer_norm = config.feat_proj_layer_norm
+        if self.feat_proj_layer_norm:
+            self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
         self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
         self.dropout = nn.Dropout(config.feat_proj_dropout)
 
     def forward(self, hidden_states):
         # non-projected hidden states are needed for quantization
-        hidden_states = self.layer_norm(hidden_states)
+        if self.feat_proj_layer_norm:
+            hidden_states = self.layer_norm(hidden_states)
         hidden_states = self.projection(hidden_states)
         hidden_states = self.dropout(hidden_states)
         return hidden_states
diff --git a/tests/test_modeling_hubert.py b/tests/test_modeling_hubert.py
index ad8a1dd206..6d69f6c923 100644
--- a/tests/test_modeling_hubert.py
+++ b/tests/test_modeling_hubert.py
@@ -760,3 +760,46 @@ class HubertModelIntegrationTest(unittest.TestCase):
         self.assertListEqual(predicted_ids.tolist(), expected_labels)
         # TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
         self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-1))
+
+    def test_inference_distilhubert(self):
+        model = HubertModel.from_pretrained("anton-l/distilhubert").to(torch_device)
+        processor = Wav2Vec2FeatureExtractor.from_pretrained("anton-l/distilhubert")
+
+        # TODO: can't test on batched inputs due to incompatible padding https://github.com/pytorch/fairseq/pull/3572
+        input_speech = self._load_datasamples(1)
+
+        inputs = processor(input_speech, return_tensors="pt", padding=True)
+
+        input_values = inputs.input_values.to(torch_device)
+
+        with torch.no_grad():
+            outputs = model(input_values).last_hidden_state
+
+        # expected outputs taken from the original DistilHuBERT (S3PRL) implementation
+        expected_outputs_first = torch.tensor(
+            [
+                [
+                    [-0.3505, 0.1167, 0.0608, 0.1294],
+                    [-0.3085, 0.0481, 0.1106, 0.0955],
+                    [-0.3107, -0.0391, 0.0739, 0.1360],
+                    [-0.2385, -0.1795, -0.0928, 0.2389],
+                ]
+            ],
+            device=torch_device,
+        )
+        expected_outputs_last = torch.tensor(
+            [
+                [
+                    [-0.0732, 0.0255, 0.0529, -0.1372],
+                    [-0.0812, 0.1259, 0.0564, -0.0438],
+                    [-0.0054, 0.0758, -0.0002, -0.1617],
+                    [0.0133, -0.0320, -0.0687, 0.0062],
+                ]
+            ],
+            device=torch_device,
+        )
+        expected_output_sum = -3776.0730
+
+        self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=5e-3))
+        self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=5e-3))
+        self.assertTrue(abs(outputs.sum() - expected_output_sum) < 0.1)