Wav2Vec2 meets phonemes (#14353)

* up

* add tokenizer

* improve more

* finish tokenizer

* finish

* adapt speech recognition script

* adapt convert

* more fixes

* more fixes

* update phonemizer wav2vec2

* better naming

* fix more tests

* more fixes swedish

* correct tests

* finish

* improve script

* remove file

* up

* lets get those 100 model architectures until the end of the month

* make fix-copies

* correct more

* correct script

* more fixes

* more fixes

* add to docs

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* replace assert

* fix copies

* fix docs

* new try docs

* boom boom

* update

* add phonemizer to audio tests

* make fix-copies

* up

* upload models

* some changes

* Update tests/test_tokenization_wav2vec2_phoneme.py

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* more fixes

* remove @

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
This commit is contained in:
Patrick von Platen
2021-12-17 19:56:44 +01:00
committed by GitHub
parent 77d6c826d8
commit c4a96cecbc
26 changed files with 1296 additions and 151 deletions

View File

@@ -21,6 +21,7 @@ import logging
import os
import re
import sys
import warnings
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union
@@ -34,6 +35,7 @@ from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoModelForCTC,
AutoProcessor,
AutoTokenizer,
HfArgumentParser,
Trainer,
@@ -68,6 +70,10 @@ class ModelArguments:
model_name_or_path: str = field(
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
tokenizer_name_or_path: Optional[str] = field(
default=None,
metadata={"help": "Path to pretrained tokenizer or tokenizer identifier from huggingface.co/models"},
)
cache_dir: Optional[str] = field(
default=None,
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
@@ -191,7 +197,7 @@ class DataTrainingArguments:
max_duration_in_seconds: Optional[float] = field(
default=20.0,
metadata={
"help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
},
)
min_duration_in_seconds: Optional[float] = field(
@@ -210,7 +216,28 @@ class DataTrainingArguments:
default=False,
metadata={
"help": "If :obj:`True`, will use the token generated when running"
":obj:`transformers-cli logiin as HTTP bearer authorization for remote files."
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
},
)
unk_token: Optional[str] = field(
default="[UNK]",
metadata={"help": "The unk token for the tokenizer"},
)
pad_token: Optional[str] = field(
default="[PAD]",
metadata={"help": "The padding token for the tokenizer"},
)
word_delimiter_token: Optional[str] = field(
default="|",
metadata={"help": "The word delimiter token for the tokenizer"},
)
phoneme_language: Optional[str] = field(
default=None,
metadata={
"help": "The target language that should be used be"
" passed to the tokenizer for tokenization. Note that"
" this is only relevant if the model classifies the"
" input audio to a sequence of phoneme sequences."
},
)
@@ -220,7 +247,7 @@ class DataCollatorCTCWithPadding:
"""
Data collator that will dynamically pad the inputs received.
Args:
processor (:class:`~transformers.Wav2Vec2Processor`)
processor (:class:`~transformers.AutoProcessor`)
The processor used for proccessing the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
@@ -241,7 +268,7 @@ class DataCollatorCTCWithPadding:
7.5 (Volta).
"""
processor: Wav2Vec2Processor
processor: AutoProcessor
padding: Union[bool, str] = "longest"
pad_to_multiple_of: Optional[int] = None
pad_to_multiple_of_labels: Optional[int] = None
@@ -275,7 +302,12 @@ class DataCollatorCTCWithPadding:
return batch
def create_vocabulary_from_data(datasets: DatasetDict):
def create_vocabulary_from_data(
datasets: DatasetDict,
word_delimiter_token: Optional[str] = None,
unk_token: Optional[str] = None,
pad_token: Optional[str] = None,
):
# Given training and test labels create vocabulary
def extract_all_chars(batch):
all_text = " ".join(batch["target_text"])
@@ -298,12 +330,16 @@ def create_vocabulary_from_data(datasets: DatasetDict):
vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
# replace white space with delimiter token
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
if word_delimiter_token is not None:
vocab_dict[word_delimiter_token] = vocab_dict[" "]
del vocab_dict[" "]
# add unk and pad token
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
if unk_token is not None:
vocab_dict[unk_token] = len(vocab_dict)
if pad_token is not None:
vocab_dict[pad_token] = len(vocab_dict)
return vocab_dict
@@ -359,103 +395,126 @@ def main():
# 1. First, let's load the dataset
raw_datasets = DatasetDict()
raw_datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
)
raw_datasets["eval"] = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name
)
if data_args.audio_column_name not in raw_datasets["train"].column_names:
raise ValueError(
f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
"Make sure to set `--audio_column_name` to the correct audio column - one of "
f"{', '.join(raw_datasets['train'].column_names)}."
if training_args.do_train:
raw_datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
)
if data_args.text_column_name not in raw_datasets["train"].column_names:
raise ValueError(
f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
"Make sure to set `--text_column_name` to the correct text column - one of "
f"{', '.join(raw_datasets['train'].column_names)}."
if data_args.audio_column_name not in raw_datasets["train"].column_names:
raise ValueError(
f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
"Make sure to set `--audio_column_name` to the correct audio column - one of "
f"{', '.join(raw_datasets['train'].column_names)}."
)
if data_args.text_column_name not in raw_datasets["train"].column_names:
raise ValueError(
f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. "
"Make sure to set `--text_column_name` to the correct text column - one of "
f"{', '.join(raw_datasets['train'].column_names)}."
)
if data_args.max_train_samples is not None:
raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
if training_args.do_eval:
raw_datasets["eval"] = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name
)
# prepare dataset
if data_args.max_train_samples is not None:
raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
if data_args.max_eval_samples is not None:
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
if data_args.max_eval_samples is not None:
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
# 2. We remove some special characters from the datasets
# that make training complicated and do not help in transcribing the speech
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
# that could be easily picked up by the model
chars_to_ignore_regex = (
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
)
text_column_name = data_args.text_column_name
def remove_special_characters(batch):
if chars_to_ignore_regex is not None:
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[data_args.text_column_name]).lower() + " "
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).lower() + " "
else:
batch["target_text"] = batch[data_args.text_column_name].lower() + " "
batch["target_text"] = batch[text_column_name].lower() + " "
return batch
with training_args.main_process_first(desc="dataset map special characters removal"):
raw_datasets = raw_datasets.map(
remove_special_characters,
remove_columns=[data_args.text_column_name],
remove_columns=[text_column_name],
desc="remove special characters from datasets",
)
# 3. Next, we create the vocabulary of the model by extracting all unique characters from
# the training and evaluation datasets
# We need to make sure that only first rank saves vocabulary
# make sure all processes wait until vocab is created
vocab_file = os.path.join(training_args.output_dir, "vocab.json")
with training_args.main_process_first():
if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
os.remove(vocab_file)
with training_args.main_process_first(desc="dataset map vocabulary creation"):
if not os.path.isfile(vocab_file):
os.makedirs(training_args.output_dir, exist_ok=True)
vocab_dict = create_vocabulary_from_data(raw_datasets)
# save vocab dict to be loaded into tokenizer
with open(vocab_file, "w") as file:
json.dump(vocab_dict, file)
# 4. Now we can instantiate the configuration, feature extractor, tokenizer and model
# Note for distributed training, the .from_pretrained methods guarantee that only
# one local process can concurrently download model & vocab.
# save special tokens for tokenizer
word_delimiter_token = data_args.word_delimiter_token
unk_token = data_args.unk_token
pad_token = data_args.pad_token
# 3. Next, let's load the config as we might need it to create
# the tokenizer
# load config
config = AutoConfig.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
)
# tokenizer is defined by `tokenizer_class` if present in config else by `model_type`
config_for_tokenizer = config if config.tokenizer_class is not None else None
tokenizer_type = config.model_type if config.tokenizer_class is None else None
# 4. Next, if no tokenizer file is defined,
# we create the vocabulary of the model by extracting all unique characters from
# the training and evaluation datasets
# We need to make sure that only first rank saves vocabulary
# make sure all processes wait until vocab is created
tokenizer_name_or_path = model_args.tokenizer_name_or_path
tokenizer_type_hints = {}
if tokenizer_name_or_path is None:
# save vocab in training output dir
tokenizer_name_or_path = training_args.output_dir
# load feature_extractor, tokenizer and create processor
vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json")
with training_args.main_process_first():
if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
os.remove(vocab_file)
with training_args.main_process_first(desc="dataset map vocabulary creation"):
if not os.path.isfile(vocab_file):
os.makedirs(tokenizer_name_or_path, exist_ok=True)
vocab_dict = create_vocabulary_from_data(
raw_datasets,
word_delimiter_token=word_delimiter_token,
unk_token=unk_token,
pad_token=pad_token,
)
# save vocab dict to be loaded into tokenizer
with open(vocab_file, "w") as file:
json.dump(vocab_dict, file)
# if tokenizer has just been created
# it is defined by `tokenizer_class` if present in config else by `model_type`
tokenizer_type_hints = {
"config": config if config.tokenizer_class is not None else None,
"tokenizer_type": config.model_type if config.tokenizer_class is None else None,
}
# 5. Now we can instantiate the feature extractor, tokenizer and model
# Note for distributed training, the .from_pretrained methods guarantee that only
# one local process can concurrently download model & vocab.
# load feature_extractor and tokenizer
tokenizer = AutoTokenizer.from_pretrained(
training_args.output_dir,
config=config_for_tokenizer,
tokenizer_type=tokenizer_type,
unk_token="[UNK]",
pad_token="[PAD]",
word_delimiter_token="|",
tokenizer_name_or_path,
unk_token=unk_token,
pad_token=pad_token,
word_delimiter_token=word_delimiter_token,
use_auth_token=data_args.use_auth_token,
**tokenizer_type_hints,
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# adapt config
config.update(
@@ -471,8 +530,8 @@ def main():
"gradient_checkpointing": training_args.gradient_checkpointing,
"layerdrop": model_args.layerdrop,
"ctc_loss_reduction": model_args.ctc_loss_reduction,
"pad_token_id": processor.tokenizer.pad_token_id,
"vocab_size": len(processor.tokenizer),
"pad_token_id": tokenizer.pad_token_id,
"vocab_size": len(tokenizer),
"activation_dropout": model_args.activation_dropout,
}
)
@@ -489,55 +548,64 @@ def main():
if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
# 5. Now we preprocess the datasets including loading the audio, resampling and normalization
# 6. Now we preprocess the datasets including loading the audio, resampling and normalization
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
# so that we just need to set the correct target sampling rate and normalize the input
# via the `feature_extractor`
# make sure that dataset decodes audio with correct sampling rate
raw_datasets = raw_datasets.cast_column(
data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
)
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
if dataset_sampling_rate != feature_extractor.sampling_rate:
raw_datasets = raw_datasets.cast_column(
data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
)
# derive max & min input length for sample rate & max duration
max_input_length = data_args.max_duration_in_seconds * processor.feature_extractor.sampling_rate
min_input_length = data_args.min_duration_in_seconds * processor.feature_extractor.sampling_rate
max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate
min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate
audio_column_name = data_args.audio_column_name
num_workers = data_args.preprocessing_num_workers
# `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification
phoneme_language = data_args.phoneme_language
# Preprocessing the datasets.
# We need to read the audio files as arrays and tokenize the targets.
def prepare_dataset(batch):
# load audio
sample = batch[data_args.audio_column_name]
sample = batch[audio_column_name]
batch["input_values"] = processor(
sample["array"], sampling_rate=sample["sampling_rate"], truncate=True, max_length=max_input_length
).input_values[0]
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
batch["input_values"] = inputs.input_values[0]
batch["input_length"] = len(batch["input_values"])
# Setup the processor for targets
with processor.as_target_processor():
batch["labels"] = processor(batch["target_text"]).input_ids
# encode targets
additional_kwargs = {}
if phoneme_language is not None:
additional_kwargs["phonemizer_lang"] = phoneme_language
batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
return batch
with training_args.main_process_first(desc="dataset map preprocessing"):
vectorized_datasets = raw_datasets.map(
prepare_dataset,
remove_columns=raw_datasets["train"].column_names,
num_proc=data_args.preprocessing_num_workers,
remove_columns=next(iter(raw_datasets.values())).column_names,
num_proc=num_workers,
desc="preprocess datasets",
)
if min_input_length > 0.0:
# filter data that is shorter than min_input_length
vectorized_datasets = vectorized_datasets.filter(
lambda x: x > min_input_length,
num_proc=data_args.preprocessing_num_workers,
input_columns=["input_length"],
)
def is_audio_in_length_range(length):
return length > min_input_length and length < max_input_length
vectorized_datasets = vectorized_datasets.remove_columns("input_length")
# filter data that is shorter than min_input_length
vectorized_datasets = vectorized_datasets.filter(
is_audio_in_length_range,
num_proc=num_workers,
input_columns=["input_length"],
)
# 6. Next, we can prepare the training.
# 7. Next, we can prepare the training.
# Let's use word error rate (WER) as our evaluation metric,
# instantiate a data collator and the trainer
@@ -557,16 +625,36 @@ def main():
pred_logits = pred.predictions
pred_ids = np.argmax(pred_logits, axis=-1)
pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id
pred_str = processor.batch_decode(pred_ids)
pred_str = tokenizer.batch_decode(pred_ids)
# we do not want to group tokens when computing the metrics
label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False)
metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}
return metrics
# Now create a single processor
if is_main_process(training_args.local_rank):
# save feature extractor, tokenizer and config
feature_extractor.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)
config.save_pretrained(training_args.output_dir)
# load processor
try:
processor = AutoProcessor.from_pretrained(training_args.output_dir)
except (OSError, KeyError):
warnings.warn(
"Loading a processor from a feature extractor config that does not"
" include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following "
" attribute to your `preprocessor_config.json` file to suppress this warning: "
" `'processor_class': 'Wav2Vec2Processor'`",
FutureWarning,
)
processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir)
# Instantiate custom data collator
data_collator = DataCollatorCTCWithPadding(processor=processor)
@@ -578,10 +666,10 @@ def main():
compute_metrics=compute_metrics,
train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
tokenizer=processor.feature_extractor,
tokenizer=feature_extractor,
)
# 7. Finally, we can start training
# 8. Finally, we can start training
# Training
if training_args.do_train:
@@ -594,10 +682,6 @@ def main():
else:
checkpoint = None
# Save the feature_extractor and the tokenizer
if is_main_process(training_args.local_rank):
processor.save_pretrained(training_args.output_dir)
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()