From 7a1412e12b4755fcb1e40fa1a7243d5dc04d0693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20REMY?= Date: Mon, 7 Feb 2022 17:03:12 +0100 Subject: [PATCH] Wav2Vec2 models must either throw or deal with add_apater (#15409) * Wav2Vec2 models must either throw or deal with add_apater Co-authored-by: Patrick von Platen * Add pre-add_adapter backwards compatibility * Add pre-add_adapter backwards compatibility * Fix issue in tests/test_modeling_wav2vec2.py Co-authored-by: Patrick von Platen Co-authored-by: Patrick von Platen --- src/transformers/models/hubert/modeling_hubert.py | 9 ++++++++- src/transformers/models/sew/modeling_sew.py | 9 ++++++++- src/transformers/models/sew_d/modeling_sew_d.py | 9 ++++++++- .../models/unispeech/modeling_unispeech.py | 9 ++++++++- .../unispeech_sat/modeling_unispeech_sat.py | 13 ++++++++++++- .../models/wav2vec2/modeling_wav2vec2.py | 13 ++++++++++++- src/transformers/models/wavlm/modeling_wavlm.py | 13 ++++++++++++- tests/test_modeling_wav2vec2.py | 15 +++++++++++++++ 8 files changed, 83 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 85f53e7ada..928a742dee 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1097,7 +1097,10 @@ class HubertForCTC(HubertPreTrainedModel): "instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. " "or define `vocab_size` of your model's configuration." ) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) # Initialize weights and apply final processing self.post_init() @@ -1215,6 +1218,10 @@ class HubertForSequenceClassification(HubertPreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)" + ) self.hubert = HubertModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index bed9dada32..6fd0a1861f 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -981,7 +981,10 @@ class SEWForCTC(SEWPreTrainedModel): "instantiate the model as follows: `SEWForCTC.from_pretrained(..., vocab_size=vocab_size)`. " "or define `vocab_size` of your model's configuration." ) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) # Initialize weights and apply final processing self.post_init() @@ -1099,6 +1102,10 @@ class SEWForSequenceClassification(SEWPreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of SEW adapters (config.add_adapter=True)" + ) self.sew = SEWModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 5923837cbb..af7dcba4b9 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1513,7 +1513,10 @@ class SEWDForCTC(SEWDPreTrainedModel): "instantiate the model as follows: `SEWDForCTC.from_pretrained(..., vocab_size=vocab_size)`. " "or define `vocab_size` of your model's configuration." ) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) # Initialize weights and apply final processing self.post_init() @@ -1631,6 +1634,10 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of SEWD adapters (config.add_adapter=True)" + ) self.sew_d = SEWDModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 6fa54a64d3..768fbfc609 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1374,7 +1374,10 @@ class UniSpeechForCTC(UniSpeechPreTrainedModel): "instantiate the model as follows: `UniSpeechForCTC.from_pretrained(..., vocab_size=vocab_size)`. " "or define `vocab_size` of your model's configuration." ) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) # Initialize weights and apply final processing self.post_init() @@ -1492,6 +1495,10 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of UniSpeech adapters (config.add_adapter=True)" + ) self.unispeech = UniSpeechModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 19bba7187e..ddc98c8001 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1402,7 +1402,10 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel): "instantiate the model as follows: `UniSpeechSatForCTC.from_pretrained(..., vocab_size=vocab_size)`. " "or define `vocab_size` of your model's configuration." ) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) # Initialize weights and apply final processing self.post_init() @@ -1520,6 +1523,10 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of UniSpeechSat adapters (config.add_adapter=True)" + ) self.unispeech_sat = UniSpeechSatModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: @@ -1640,6 +1647,10 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Audio frame classification does not support the use of UniSpeechSat adapters (config.add_adapter=True)" + ) self.unispeech_sat = UniSpeechSatModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 7d4452c132..f45c0a8e68 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1702,7 +1702,10 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. " "or define `vocab_size` of your model's configuration." ) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) # Initialize weights and apply final processing self.post_init() @@ -1819,6 +1822,10 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)" + ) self.wav2vec2 = Wav2Vec2Model(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: @@ -1938,6 +1945,10 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Audio frame classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)" + ) self.wav2vec2 = Wav2Vec2Model(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index de776c27bf..2618ec9bbc 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -1352,7 +1352,10 @@ class WavLMForCTC(WavLMPreTrainedModel): "instantiate the model as follows: `WavLMForCTC.from_pretrained(..., vocab_size=vocab_size)`. " "or define `vocab_size` of your model's configuration." ) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + self.lm_head = nn.Linear(output_hidden_size, config.vocab_size) # Initialize weights and apply final processing self.post_init() @@ -1470,6 +1473,10 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Sequence classification does not support the use of WavLM adapters (config.add_adapter=True)" + ) self.wavlm = WavLMModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: @@ -1590,6 +1597,10 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel): def __init__(self, config): super().__init__(config) + if hasattr(config, "add_adapter") and config.add_adapter: + raise ValueError( + "Audio frame classification does not support the use of WavLM adapters (config.add_adapter=True)" + ) self.wavlm = WavLMModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings if config.use_weighted_layer_sum: diff --git a/tests/test_modeling_wav2vec2.py b/tests/test_modeling_wav2vec2.py index 9c07df5fed..b5e1bcd2cd 100644 --- a/tests/test_modeling_wav2vec2.py +++ b/tests/test_modeling_wav2vec2.py @@ -202,6 +202,17 @@ class Wav2Vec2ModelTester: result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size) ) + def create_and_check_model_with_adapter_for_ctc(self, config, input_values, attention_mask): + config.add_adapter = True + config.output_hidden_size = 2 * config.hidden_size + model = Wav2Vec2ForCTC(config=config) + model.to(torch_device) + model.eval() + result = model(input_values, attention_mask=attention_mask) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.adapter_output_seq_length, self.vocab_size) + ) + def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask): config.add_adapter = True config.output_hidden_size = 8 @@ -414,6 +425,10 @@ class Wav2Vec2ModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_adapter(*config_and_inputs) + def test_model_with_adapter_for_ctc(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model_with_adapter_for_ctc(*config_and_inputs) + def test_model_with_adapter_proj_dim(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs)