Add DistilHuBERT (#14174)
* Add conversion * Rename * Add an integration test and remove layer_norm * Remove layer_norm from the converter * wording * Fix imports
This commit is contained in:
@@ -70,6 +70,8 @@ class HubertConfig(PretrainedConfig):
|
|||||||
convolutional layers.
|
convolutional layers.
|
||||||
feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
The dropout probability for output of the feature extractor.
|
The dropout probability for output of the feature extractor.
|
||||||
|
feat_proj_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to apply LayerNorm to the output of the feature extractor.
|
||||||
feat_extract_activation (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
|
feat_extract_activation (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
|
||||||
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
||||||
extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
|
extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
|
||||||
@@ -149,6 +151,7 @@ class HubertConfig(PretrainedConfig):
|
|||||||
hidden_dropout=0.1,
|
hidden_dropout=0.1,
|
||||||
activation_dropout=0.1,
|
activation_dropout=0.1,
|
||||||
attention_dropout=0.1,
|
attention_dropout=0.1,
|
||||||
|
feat_proj_layer_norm=True,
|
||||||
feat_proj_dropout=0.0,
|
feat_proj_dropout=0.0,
|
||||||
final_dropout=0.1,
|
final_dropout=0.1,
|
||||||
layerdrop=0.1,
|
layerdrop=0.1,
|
||||||
@@ -195,6 +198,7 @@ class HubertConfig(PretrainedConfig):
|
|||||||
self.hidden_dropout = hidden_dropout
|
self.hidden_dropout = hidden_dropout
|
||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
self.activation_dropout = activation_dropout
|
self.activation_dropout = activation_dropout
|
||||||
|
self.feat_proj_layer_norm = feat_proj_layer_norm
|
||||||
self.feat_proj_dropout = feat_proj_dropout
|
self.feat_proj_dropout = feat_proj_dropout
|
||||||
self.final_dropout = final_dropout
|
self.final_dropout = final_dropout
|
||||||
self.layerdrop = layerdrop
|
self.layerdrop = layerdrop
|
||||||
|
|||||||
@@ -0,0 +1,218 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2021 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Convert Hubert checkpoint."""
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from s3prl.hub import distilhubert
|
||||||
|
from transformers import HubertConfig, HubertModel, Wav2Vec2FeatureExtractor, logging
|
||||||
|
|
||||||
|
|
||||||
|
# Switch the library logger to INFO so the per-tensor `logger.info` messages
# emitted during conversion are shown.
logging.set_verbosity_info()
logger = logging.get_logger(__name__)

# Maps a substring of a parameter name in the original checkpoint (key) to the
# dotted attribute path of the corresponding parameter inside the HF model
# (value). A "*" in the value is a placeholder that `recursively_load_weights`
# replaces with the layer index parsed from the original parameter name.
MAPPING = {
    "post_extract_proj": "feature_projection.projection",
    "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
    "self_attn.k_proj": "encoder.layers.*.attention.k_proj",
    "self_attn.v_proj": "encoder.layers.*.attention.v_proj",
    "self_attn.q_proj": "encoder.layers.*.attention.q_proj",
    "self_attn.out_proj": "encoder.layers.*.attention.out_proj",
    "self_attn_layer_norm": "encoder.layers.*.layer_norm",
    "fc1": "encoder.layers.*.feed_forward.intermediate_dense",
    "fc2": "encoder.layers.*.feed_forward.output_dense",
    "final_layer_norm": "encoder.layers.*.final_layer_norm",
    "encoder.layer_norm": "encoder.layer_norm",
    "mask_emb": "masked_spec_embed",
}
|
||||||
|
|
||||||
|
|
||||||
|
def set_recursively(hf_pointer, key, value, full_name, weight_type):
    """
    Copy one tensor of the original checkpoint into the object addressed by ``key``.

    Args:
        hf_pointer: Root object from which ``key`` is resolved attribute by attribute.
        key (str): Dot-separated attribute path of the target module or parameter.
        value: Tensor taken from the original checkpoint.
        full_name (str): Name of ``value`` in the original checkpoint (used in messages only).
        weight_type (str or None): ``"weight"``, ``"weight_g"``, ``"weight_v"`` or ``"bias"`` to write
            into that attribute of the resolved object. With any other value (including ``None``)
            the resolved object itself receives the data.

    Raises:
        ValueError: If the shape of the target differs from ``value.shape``.
    """
    for attribute in key.split("."):
        hf_pointer = getattr(hf_pointer, attribute)

    if weight_type is not None:
        hf_shape = getattr(hf_pointer, weight_type).shape
    else:
        hf_shape = hf_pointer.shape

    # Name used in messages. The key is kept even when there is no weight type,
    # otherwise the message would not say which parameter is concerned.
    target_name = key if weight_type is None else f"{key}.{weight_type}"

    # Explicit exception rather than `assert`, so the check survives `python -O`.
    if hf_shape != value.shape:
        raise ValueError(
            f"Shape of hf {target_name} is {hf_shape}, but should be {value.shape} for {full_name}"
        )

    if weight_type in ("weight", "weight_g", "weight_v", "bias"):
        getattr(hf_pointer, weight_type).data = value
    else:
        hf_pointer.data = value

    logger.info(f"{target_name} was initialized from {full_name}.")
|
||||||
|
|
||||||
|
|
||||||
|
def recursively_load_weights(fairseq_model, hf_model):
    """
    Copy every tensor of ``fairseq_model.state_dict()`` into ``hf_model``.

    Names containing ``"conv_layers"`` are handed to ``load_conv_layer``. Every other name is
    compared against the keys of ``MAPPING``, and each key found in the name triggers one
    ``set_recursively`` call. Names that match nothing are collected and reported in a single
    warning at the end.
    """
    # Checked in this order, so "weight_g" / "weight_v" win over the plain "weight" substring.
    weight_kinds = ("weight_g", "weight_v", "weight", "bias")

    leftover = []
    source_tensors = fairseq_model.state_dict()
    conv_extractor = hf_model.feature_extractor

    for name, value in source_tensors.items():
        if "conv_layers" in name:
            load_conv_layer(
                name,
                value,
                conv_extractor,
                leftover,
                hf_model.config.feat_extract_norm == "group",
            )
            continue

        matched = False
        for key, target in MAPPING.items():
            if key not in name:
                continue
            matched = True
            if "*" in target:
                # The layer index is the second-to-last dot-separated piece of
                # whatever precedes the key in the original name.
                layer_index = name.split(key)[0].split(".")[-2]
                target = target.replace("*", layer_index)
            weight_type = next((kind for kind in weight_kinds if kind in name), None)
            set_recursively(hf_model, target, value, name, weight_type)

        if not matched:
            leftover.append(name)

    logger.warning(f"Unused weights: {leftover}")
|
||||||
|
|
||||||
|
|
||||||
|
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
    """
    Copy one tensor of the original convolutional feature extractor into ``feature_extractor``.

    The part of ``full_name`` after ``"conv_layers."`` is parsed as ``<layer_id>.<type_id>...``:

    - ``type_id == 0`` targets ``conv_layers[layer_id].conv``;
    - ``type_id == 2`` targets ``conv_layers[layer_id].layer_norm``, but only when group norm is not
      used or when ``layer_id == 0``;
    - anything else is not copied and ``full_name`` is appended to ``unused_weights``.

    Within a handled slot, ``"bias"`` or ``"weight"`` in the name selects the parameter. A name
    containing neither is ignored without being recorded.

    Args:
        full_name (str): Name of the tensor in the original checkpoint.
        value: Tensor to copy.
        feature_extractor: Object exposing an indexable ``conv_layers`` attribute.
        unused_weights (list): Receives ``full_name`` when the tensor is not copied (mutated in place).
        use_group_norm (bool): Whether the target feature extractor uses group norm.

    Raises:
        ValueError: If the shape of ``value`` differs from the shape of the target parameter.
    """
    name = full_name.split("conv_layers.")[-1]
    items = name.split(".")
    layer_id = int(items[0])
    type_id = int(items[1])

    if type_id == 0:
        submodule_name = "conv"
    elif type_id == 2 and (not use_group_norm or layer_id == 0):
        submodule_name = "layer_norm"
    else:
        unused_weights.append(full_name)
        return

    if "bias" in name:
        param_name = "bias"
    elif "weight" in name:
        param_name = "weight"
    else:
        return

    target = getattr(getattr(feature_extractor.conv_layers[layer_id], submodule_name), param_name)

    # Explicit exception rather than `assert`, so the check survives `python -O`.
    # The message is built from the resolved target, so reporting a mismatch cannot itself fail.
    if value.shape != target.data.shape:
        raise ValueError(f"{full_name} has size {value.shape}, but {target.data.shape} was found.")

    target.data = value

    if submodule_name == "conv":
        logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
    else:
        logger.info(
            f"Feat extract layer norm {param_name} of layer {layer_id} was initialized from {full_name}."
        )
|
||||||
|
|
||||||
|
|
||||||
|
def convert_config(model):
    """
    Build a ``HubertConfig`` from the configuration attached to the original model.

    Values are either copied from ``model.config``, derived from its convolutional layer
    specification, or set to the constants hard-coded below.
    """
    hf_config = HubertConfig()
    source = model.config

    # NOTE(review): `eval` executes the string as Python code. The specification is read from
    # the original model's config, so only run this on checkpoints you trust.
    conv_spec = eval(source.extractor_conv_feature_layers)

    # Convolutional feature extractor: each entry of the specification is indexed as
    # (dim, kernel, stride).
    hf_config.conv_dim = [layer[0] for layer in conv_spec]
    hf_config.conv_kernel = [layer[1] for layer in conv_spec]
    hf_config.conv_stride = [layer[2] for layer in conv_spec]
    hf_config.num_feat_extract_layers = len(conv_spec)
    hf_config.feat_extract_norm = "layer" if source.extractor_mode == "layer_norm" else "group"

    # Values copied from the original configuration.
    hf_config.activation_dropout = source.activation_dropout
    hf_config.attention_dropout = source.attention_dropout
    hf_config.hidden_act = source.activation_fn
    hf_config.hidden_dropout = source.dropout
    hf_config.hidden_size = source.encoder_embed_dim
    hf_config.intermediate_size = source.encoder_ffn_embed_dim
    hf_config.num_attention_heads = source.encoder_attention_heads
    hf_config.num_conv_pos_embedding_groups = source.conv_pos_groups
    hf_config.num_conv_pos_embeddings = source.conv_pos
    hf_config.num_hidden_layers = source.encoder_layers

    # Constants set by this converter.
    hf_config.apply_spec_augment = False
    hf_config.conv_bias = False
    hf_config.feat_extract_activation = "gelu"
    hf_config.feat_proj_layer_norm = False
    hf_config.feat_proj_dropout = 0.0
    hf_config.final_dropout = 0.0
    hf_config.initializer_range = 0.02
    hf_config.layer_norm_eps = 1e-5
    hf_config.layerdrop = 0.0

    return hf_config
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
def convert_hubert_checkpoint(pytorch_dump_folder_path, config_path=None):
    """
    Convert the s3prl DistilHuBERT checkpoint into a ``HubertModel`` and save it.

    Args:
        pytorch_dump_folder_path: Destination passed to ``save_pretrained`` for both the feature
            extractor and the converted model.
        config_path (optional): Passed to ``HubertConfig.from_pretrained`` when given. When
            ``None``, the configuration is derived from the original model with ``convert_config``.
    """
    original_model = distilhubert().model.model

    config = convert_config(original_model) if config_path is None else HubertConfig.from_pretrained(config_path)
    original_model = original_model.eval()

    extractor_settings = {
        "feature_size": 1,
        "sampling_rate": 16000,
        "padding_value": 0,
        "do_normalize": False,
        "return_attention_mask": False,
    }
    feature_extractor = Wav2Vec2FeatureExtractor(**extractor_settings)

    hf_model = HubertModel(config)
    recursively_load_weights(original_model, hf_model)

    # Both artifacts go to the same folder.
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
    hf_model.save_pretrained(pytorch_dump_folder_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required: without an output folder the run would only fail at the very end,
    # in `save_pretrained`, after the checkpoint has been loaded and converted.
    parser.add_argument(
        "--pytorch_dump_folder_path", required=True, type=str, help="Path to the output PyTorch model."
    )
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    args = parser.parse_args()
    convert_hubert_checkpoint(args.pytorch_dump_folder_path, args.config_path)
|
||||||
@@ -325,13 +325,16 @@ class HubertFeatureExtractor(nn.Module):
|
|||||||
class HubertFeatureProjection(nn.Module):
    """
    Project feature-extractor outputs from ``config.conv_dim[-1]`` to ``config.hidden_size``.

    A ``LayerNorm`` is created, and applied before the projection, only when
    ``config.feat_proj_layer_norm`` is true. Dropout with probability
    ``config.feat_proj_dropout`` is applied to the projected states.
    """

    def __init__(self, config):
        super().__init__()
        in_features = config.conv_dim[-1]

        self.feat_proj_layer_norm = config.feat_proj_layer_norm
        # The submodule exists only when enabled, so a model without it has no
        # `layer_norm.*` entries among its parameters.
        if self.feat_proj_layer_norm:
            self.layer_norm = nn.LayerNorm(in_features, eps=config.layer_norm_eps)
        self.projection = nn.Linear(in_features, config.hidden_size)
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        normed = self.layer_norm(hidden_states) if self.feat_proj_layer_norm else hidden_states
        return self.dropout(self.projection(normed))
|
||||||
|
|||||||
@@ -760,3 +760,46 @@ class HubertModelIntegrationTest(unittest.TestCase):
|
|||||||
self.assertListEqual(predicted_ids.tolist(), expected_labels)
|
self.assertListEqual(predicted_ids.tolist(), expected_labels)
|
||||||
# TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
|
# TODO: lower the tolerance after merging the padding fix https://github.com/pytorch/fairseq/pull/3572
|
||||||
self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-1))
|
self.assertTrue(torch.allclose(predicted_logits, expected_logits, atol=1e-1))
|
||||||
|
|
||||||
|
def test_inference_distilhubert(self):
    """Compare the hidden states of the converted DistilHuBERT checkpoint with stored reference values."""
    checkpoint = "anton-l/distilhubert"
    model = HubertModel.from_pretrained(checkpoint).to(torch_device)
    processor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)

    # TODO: can't test on batched inputs due to incompatible padding https://github.com/pytorch/fairseq/pull/3572
    input_speech = self._load_datasamples(1)

    encoded = processor(input_speech, return_tensors="pt", padding=True)
    input_values = encoded.input_values.to(torch_device)

    with torch.no_grad():
        hidden_states = model(input_values).last_hidden_state

    # NOTE(review): reference values presumably come from the original DistilHuBERT
    # implementation (the earlier comment said "SEW") - confirm.
    expected_first_block = torch.tensor(
        [
            [
                [-0.3505, 0.1167, 0.0608, 0.1294],
                [-0.3085, 0.0481, 0.1106, 0.0955],
                [-0.3107, -0.0391, 0.0739, 0.1360],
                [-0.2385, -0.1795, -0.0928, 0.2389],
            ]
        ],
        device=torch_device,
    )
    expected_last_block = torch.tensor(
        [
            [
                [-0.0732, 0.0255, 0.0529, -0.1372],
                [-0.0812, 0.1259, 0.0564, -0.0438],
                [-0.0054, 0.0758, -0.0002, -0.1617],
                [0.0133, -0.0320, -0.0687, 0.0062],
            ]
        ],
        device=torch_device,
    )
    expected_total = -3776.0730

    # First 4 frames x first 4 features, then last 4 frames x last 4 features.
    first_block = hidden_states[:, :4, :4]
    last_block = hidden_states[:, -4:, -4:]

    self.assertTrue(torch.allclose(first_block, expected_first_block, atol=5e-3))
    self.assertTrue(torch.allclose(last_block, expected_last_block, atol=5e-3))
    self.assertTrue(abs(hidden_states.sum() - expected_total) < 0.1)
|
||||||
|
|||||||
Reference in New Issue
Block a user