Wav2Vec2 meets phonemes (#14353)

* up * add tokenizer * improve more * finish tokenizer * finish * adapt speech recognition script * adapt convert * more fixes * more fixes * update phonemizer wav2vec2 * better naming * fix more tests * more fixes swedish * correct tests * finish * improve script * remove file * up * lets get those 100 model architectures until the end of the month * make fix-copies * correct more * correct script * more fixes * more fixes * add to docs * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * replace assert * fix copies * fix docs * new try docs * boom boom * update * add phonemizer to audio tests * make fix-copies * up * upload models * some changes * Update tests/test_tokenization_wav2vec2_phoneme.py Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com> * more fixes * remove @ Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
2021-12-17 19:56:44 +01:00
parent 77d6c826d8
commit c4a96cecbc
26 changed files with 1296 additions and 151 deletions
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -21,6 +21,7 @@ import logging
 import os
 import re
 import sys
+import warnings
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union

@@ -34,6 +35,7 @@ from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForCTC,
+    AutoProcessor,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
@@ -68,6 +70,10 @@ class ModelArguments:
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
+    tokenizer_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
+    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
@@ -191,7 +197,7 @@ class DataTrainingArguments:
    max_duration_in_seconds: Optional[float] = field(
        default=20.0,
        metadata={
-            "help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
+            "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
        },
    )
    min_duration_in_seconds: Optional[float] = field(
@@ -210,7 +216,28 @@ class DataTrainingArguments:
        default=False,
        metadata={
            "help": "If :obj:`True`, will use the token generated when running"
-            ":obj:`transformers-cli logiin as HTTP bearer authorization for remote files."
+            ":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
+        },
+    )
+    unk_token: Optional[str] = field(
+        default="[UNK]",
+        metadata={"help": "The unk token for the tokenizer"},
+    )
+    pad_token: Optional[str] = field(
+        default="[PAD]",
+        metadata={"help": "The padding token for the tokenizer"},
+    )
+    word_delimiter_token: Optional[str] = field(
+        default="|",
+        metadata={"help": "The word delimiter token for the tokenizer"},
+    )
+    phoneme_language: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The target language that should be used be"
+            " passed to the tokenizer for tokenization. Note that"
+            " this is only relevant if the model classifies the"
+            " input audio to a sequence of phoneme sequences."
        },
    )

@@ -220,7 +247,7 @@ class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
-        processor (:class:`~transformers.Wav2Vec2Processor`)
+        processor (:class:`~transformers.AutoProcessor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
@@ -241,7 +268,7 @@ class DataCollatorCTCWithPadding:
            7.5 (Volta).
    """

-    processor: Wav2Vec2Processor
+    processor: AutoProcessor
    padding: Union[bool, str] = "longest"
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None
@@ -275,7 +302,12 @@ class DataCollatorCTCWithPadding:
        return batch


-def create_vocabulary_from_data(datasets: DatasetDict):
+def create_vocabulary_from_data(
+    datasets: DatasetDict,
+    word_delimiter_token: Optional[str] = None,
+    unk_token: Optional[str] = None,
+    pad_token: Optional[str] = None,
+):
    # Given training and test labels create vocabulary
    def extract_all_chars(batch):
        all_text = " ".join(batch["target_text"])
@@ -298,12 +330,16 @@ def create_vocabulary_from_data(datasets: DatasetDict):
    vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}

    # replace white space with delimiter token
-    vocab_dict["|"] = vocab_dict[" "]
-    del vocab_dict[" "]
+    if word_delimiter_token is not None:
+        vocab_dict[word_delimiter_token] = vocab_dict[" "]
+        del vocab_dict[" "]

    # add unk and pad token
-    vocab_dict["[UNK]"] = len(vocab_dict)
-    vocab_dict["[PAD]"] = len(vocab_dict)
+    if unk_token is not None:
+        vocab_dict[unk_token] = len(vocab_dict)
+
+    if pad_token is not None:
+        vocab_dict[pad_token] = len(vocab_dict)

    return vocab_dict

@@ -359,103 +395,126 @@ def main():

    # 1. First, let's load the dataset
    raw_datasets = DatasetDict()
-    raw_datasets["train"] = load_dataset(
-        data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
-    )
-    raw_datasets["eval"] = load_dataset(
-        data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name
-    )

-    if data_args.audio_column_name not in raw_datasets["train"].column_names:
-        raise ValueError(
-            f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
-            "Make sure to set `--audio_column_name` to the correct audio column - one of "
-            f"{', '.join(raw_datasets['train'].column_names)}."
+    if training_args.do_train:
+        raw_datasets["train"] = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
        )

-    if data_args.text_column_name not in raw_datasets["train"].column_names:
-        raise ValueError(
-            f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
-            "Make sure to set `--text_column_name` to the correct text column - one of "
-            f"{', '.join(raw_datasets['train'].column_names)}."
+        if data_args.audio_column_name not in raw_datasets["train"].column_names:
+            raise ValueError(
+                f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
+                "Make sure to set `--audio_column_name` to the correct audio column - one of "
+                f"{', '.join(raw_datasets['train'].column_names)}."
+            )
+
+        if data_args.text_column_name not in raw_datasets["train"].column_names:
+            raise ValueError(
+                f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
+                "Make sure to set `--text_column_name` to the correct text column - one of "
+                f"{', '.join(raw_datasets['train'].column_names)}."
+            )
+
+        if data_args.max_train_samples is not None:
+            raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
+
+    if training_args.do_eval:
+        raw_datasets["eval"] = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name
        )

-    # prepare dataset
-    if data_args.max_train_samples is not None:
-        raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
-
-    if data_args.max_eval_samples is not None:
-        raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
+        if data_args.max_eval_samples is not None:
+            raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))

    # 2. We remove some special characters from the datasets
    # that make training complicated and do not help in transcribing the speech
    # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
    # that could be easily picked up by the model
-
    chars_to_ignore_regex = (
        f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
    )
+    text_column_name = data_args.text_column_name

    def remove_special_characters(batch):
        if chars_to_ignore_regex is not None:
-            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[data_args.text_column_name]).lower() + " "
+            batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
        else:
-            batch["target_text"] = batch[data_args.text_column_name].lower() + " "
+            batch["target_text"] = batch[text_column_name].lower() + " "
        return batch

    with training_args.main_process_first(desc="dataset map special characters removal"):
        raw_datasets = raw_datasets.map(
            remove_special_characters,
-            remove_columns=[data_args.text_column_name],
+            remove_columns=[text_column_name],
            desc="remove special characters from datasets",
        )

-    # 3. Next, we create the vocabulary of the model by extracting all unique characters from
-    # the training and evaluation datasets
-    # We need to make sure that only first rank saves vocabulary
-    # make sure all processes wait until vocab is created
-    vocab_file = os.path.join(training_args.output_dir, "vocab.json")
-
-    with training_args.main_process_first():
-        if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
-            os.remove(vocab_file)
-
-    with training_args.main_process_first(desc="dataset map vocabulary creation"):
-        if not os.path.isfile(vocab_file):
-            os.makedirs(training_args.output_dir, exist_ok=True)
-            vocab_dict = create_vocabulary_from_data(raw_datasets)
-
-            # save vocab dict to be loaded into tokenizer
-            with open(vocab_file, "w") as file:
-                json.dump(vocab_dict, file)
-
-    # 4. Now we can instantiate the configuration, feature extractor, tokenizer and model
-    # Note for distributed training, the .from_pretrained methods guarantee that only
-    # one local process can concurrently download model & vocab.
+    # save special tokens for tokenizer
+    word_delimiter_token = data_args.word_delimiter_token
+    unk_token = data_args.unk_token
+    pad_token = data_args.pad_token

+    # 3. Next, let's load the config as we might need it to create
+    # the tokenizer
    # load config
    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
    )

-    # tokenizer is defined by `tokenizer_class` if present in config else by `model_type`
-    config_for_tokenizer = config if config.tokenizer_class is not None else None
-    tokenizer_type = config.model_type if config.tokenizer_class is None else None
+    # 4. Next, if no tokenizer file is defined,
+    # we create the vocabulary of the model by extracting all unique characters from
+    # the training and evaluation datasets
+    # We need to make sure that only first rank saves vocabulary
+    # make sure all processes wait until vocab is created
+    tokenizer_name_or_path = model_args.tokenizer_name_or_path
+    tokenizer_type_hints = {}
+    if tokenizer_name_or_path is None:
+        # save vocab in training output dir
+        tokenizer_name_or_path = training_args.output_dir

-    # load feature_extractor, tokenizer and create processor
+        vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
+
+        with training_args.main_process_first():
+            if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
+                os.remove(vocab_file)
+
+        with training_args.main_process_first(desc="dataset map vocabulary creation"):
+            if not os.path.isfile(vocab_file):
+                os.makedirs(tokenizer_name_or_path, exist_ok=True)
+                vocab_dict = create_vocabulary_from_data(
+                    raw_datasets,
+                    word_delimiter_token=word_delimiter_token,
+                    unk_token=unk_token,
+                    pad_token=pad_token,
+                )
+
+                # save vocab dict to be loaded into tokenizer
+                with open(vocab_file, "w") as file:
+                    json.dump(vocab_dict, file)
+
+        # if tokenizer has just been created
+        # it is defined by `tokenizer_class` if present in config else by `model_type`
+        tokenizer_type_hints = {
+            "config": config if config.tokenizer_class is not None else None,
+            "tokenizer_type": config.model_type if config.tokenizer_class is None else None,
+        }
+
+    # 5. Now we can instantiate the feature extractor, tokenizer and model
+    # Note for distributed training, the .from_pretrained methods guarantee that only
+    # one local process can concurrently download model & vocab.
+
+    # load feature_extractor and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
-        training_args.output_dir,
-        config=config_for_tokenizer,
-        tokenizer_type=tokenizer_type,
-        unk_token="[UNK]",
-        pad_token="[PAD]",
-        word_delimiter_token="|",
+        tokenizer_name_or_path,
+        unk_token=unk_token,
+        pad_token=pad_token,
+        word_delimiter_token=word_delimiter_token,
        use_auth_token=data_args.use_auth_token,
+        **tokenizer_type_hints,
    )
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
    )
-    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

    # adapt config
    config.update(
@@ -471,8 +530,8 @@ def main():
            "gradient_checkpointing": training_args.gradient_checkpointing,
            "layerdrop": model_args.layerdrop,
            "ctc_loss_reduction": model_args.ctc_loss_reduction,
-            "pad_token_id": processor.tokenizer.pad_token_id,
-            "vocab_size": len(processor.tokenizer),
+            "pad_token_id": tokenizer.pad_token_id,
+            "vocab_size": len(tokenizer),
            "activation_dropout": model_args.activation_dropout,
        }
    )
@@ -489,55 +548,64 @@ def main():
    if model_args.freeze_feature_extractor:
        model.freeze_feature_extractor()

-    # 5. Now we preprocess the datasets including loading the audio, resampling and normalization
+    # 6. Now we preprocess the datasets including loading the audio, resampling and normalization
    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
    # so that we just need to set the correct target sampling rate and normalize the input
    # via the `feature_extractor`

    # make sure that dataset decodes audio with correct sampling rate
-    raw_datasets = raw_datasets.cast_column(
-        data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
-    )
+    dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
+    if dataset_sampling_rate != feature_extractor.sampling_rate:
+        raw_datasets = raw_datasets.cast_column(
+            data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
+        )

    # derive max & min input length for sample rate & max duration
-    max_input_length = data_args.max_duration_in_seconds * processor.feature_extractor.sampling_rate
-    min_input_length = data_args.min_duration_in_seconds * processor.feature_extractor.sampling_rate
+    max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
+    min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
+    audio_column_name = data_args.audio_column_name
+    num_workers = data_args.preprocessing_num_workers
+
+    # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
+    phoneme_language = data_args.phoneme_language

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def prepare_dataset(batch):
        # load audio
-        sample = batch[data_args.audio_column_name]
+        sample = batch[audio_column_name]

-        batch["input_values"] = processor(
-            sample["array"], sampling_rate=sample["sampling_rate"], truncate=True, max_length=max_input_length
-        ).input_values[0]
+        inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
+        batch["input_values"] = inputs.input_values[0]
        batch["input_length"] = len(batch["input_values"])

-        # Setup the processor for targets
-        with processor.as_target_processor():
-            batch["labels"] = processor(batch["target_text"]).input_ids
+        # encode targets
+        additional_kwargs = {}
+        if phoneme_language is not None:
+            additional_kwargs["phonemizer_lang"] = phoneme_language
+
+        batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
        return batch

    with training_args.main_process_first(desc="dataset map preprocessing"):
        vectorized_datasets = raw_datasets.map(
            prepare_dataset,
-            remove_columns=raw_datasets["train"].column_names,
-            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=next(iter(raw_datasets.values())).column_names,
+            num_proc=num_workers,
            desc="preprocess datasets",
        )

-        if min_input_length > 0.0:
-            # filter data that is shorter than min_input_length
-            vectorized_datasets = vectorized_datasets.filter(
-                lambda x: x > min_input_length,
-                num_proc=data_args.preprocessing_num_workers,
-                input_columns=["input_length"],
-            )
+        def is_audio_in_length_range(length):
+            return length > min_input_length and length < max_input_length

-        vectorized_datasets = vectorized_datasets.remove_columns("input_length")
+        # filter data that is shorter than min_input_length or longer than max_input_length
+        vectorized_datasets = vectorized_datasets.filter(
+            is_audio_in_length_range,
+            num_proc=num_workers,
+            input_columns=["input_length"],
+        )

-    # 6. Next, we can prepare the training.
+    # 7. Next, we can prepare the training.
    # Let's use word error rate (WER) as our evaluation metric,
    # instantiate a data collator and the trainer

@@ -557,16 +625,36 @@ def main():
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)

-        pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
+        pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id

-        pred_str = processor.batch_decode(pred_ids)
+        pred_str = tokenizer.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
-        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
+        label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)

        metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}

        return metrics

+    # Now save everything needed to later load a single processor from the output dir
+    if is_main_process(training_args.local_rank):
+        # save feature extractor, tokenizer and config
+        feature_extractor.save_pretrained(training_args.output_dir)
+        tokenizer.save_pretrained(training_args.output_dir)
+        config.save_pretrained(training_args.output_dir)
+
+    # load processor
+    try:
+        processor = AutoProcessor.from_pretrained(training_args.output_dir)
+    except (OSError, KeyError):
+        warnings.warn(
+            "Loading a processor from a feature extractor config that does not"
+            " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
+            " attribute to your `preprocessor_config.json` file to suppress this warning: "
+            " `'processor_class': 'Wav2Vec2Processor'`",
+            FutureWarning,
+        )
+        processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
+
    # Instantiate custom data collator
    data_collator = DataCollatorCTCWithPadding(processor=processor)

@@ -578,10 +666,10 @@ def main():
        compute_metrics=compute_metrics,
        train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
        eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
-        tokenizer=processor.feature_extractor,
+        tokenizer=feature_extractor,
    )

-    # 7. Finally, we can start training
+    # 8. Finally, we can start training

    # Training
    if training_args.do_train:
@@ -594,10 +682,6 @@ def main():
        else:
            checkpoint = None

-        # Save the feature_extractor and the tokenizer
-        if is_main_process(training_args.local_rank):
-            processor.save_pretrained(training_args.output_dir)
-
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()