From d72343d2b804d0304d93bac1c1b58e0dafd5e820 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 10 Jan 2022 10:46:21 +0100 Subject: [PATCH] [Wav2Vec2 Speech Event] Add speech event v2 (#15083) * up * up * up * up * up * up * improve * up * up * Update src/transformers/trainer.py * up * up * up --- examples/pytorch/speech-recognition/README.md | 2 + .../speech-recognition/requirements.txt | 1 + .../run_speech_recognition_ctc.py | 10 +- examples/research_projects/wav2vec2/README.md | 6 +- examples/research_projects/xls_r/README.md | 395 ++++++++++++++++++ 5 files changed, 409 insertions(+), 5 deletions(-) create mode 100644 examples/research_projects/xls_r/README.md diff --git a/examples/pytorch/speech-recognition/README.md b/examples/pytorch/speech-recognition/README.md index 8eeb8efc87..7db7811ba2 100644 --- a/examples/pytorch/speech-recognition/README.md +++ b/examples/pytorch/speech-recognition/README.md @@ -74,6 +74,7 @@ python run_speech_recognition_ctc.py \ --warmup_steps="500" \ --evaluation_strategy="steps" \ --text_column_name="sentence" \ + --length_column_name="input_length" \ --save_steps="400" \ --eval_steps="100" \ --layerdrop="0.0" \ @@ -108,6 +109,7 @@ python -m torch.distributed.launch \ --warmup_steps="500" \ --evaluation_strategy="steps" \ --text_column_name="sentence" \ + --length_column_name="input_length" \ --save_steps="400" \ --eval_steps="100" \ --logging_steps="1" \ diff --git a/examples/pytorch/speech-recognition/requirements.txt b/examples/pytorch/speech-recognition/requirements.txt index b5c58171d3..b8c475e40a 100644 --- a/examples/pytorch/speech-recognition/requirements.txt +++ b/examples/pytorch/speech-recognition/requirements.txt @@ -2,3 +2,4 @@ datasets >= 1.13.3 torch >= 1.5 torchaudio librosa +jiwer diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 1784b613fa..0c460c3f02 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -396,7 +396,10 @@ def main(): if training_args.do_train: raw_datasets["train"] = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name + data_args.dataset_name, + data_args.dataset_config_name, + split=data_args.train_split_name, + use_auth_token=data_args.use_auth_token, ) if data_args.audio_column_name not in raw_datasets["train"].column_names: @@ -418,7 +421,10 @@ def main(): if training_args.do_eval: raw_datasets["eval"] = load_dataset( - data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name + data_args.dataset_name, + data_args.dataset_config_name, + split=data_args.eval_split_name, + use_auth_token=data_args.use_auth_token, ) if data_args.max_eval_samples is not None: diff --git a/examples/research_projects/wav2vec2/README.md b/examples/research_projects/wav2vec2/README.md index 235284840c..9561eb252a 100644 --- a/examples/research_projects/wav2vec2/README.md +++ b/examples/research_projects/wav2vec2/README.md @@ -1,12 +1,12 @@ +**NOTE**: This example is outdated and is not longer actively maintained. Please +follow the new instructions of fine-tuning Wav2Vec2 [here](https://github.com/huggingface/transformers/blob/master/examples/pytorch/speech-recognition/README.md) + ## Fine-tuning Wav2Vec2 The `run_asr.py` script allows one to fine-tune pretrained Wav2Vec2 models that can be found [here](https://huggingface.co/models?search=facebook/wav2vec2). This finetuning script can also be run as a google colab [TODO: here]( ). -The script is actively maintained by [Patrick von Platen](https://github.com/patrickvonplaten). -Feel free to ask a question on the [Forum](https://discuss.huggingface.co/) or post an issue on [GitHub](https://github.com/huggingface/transformers/issues/new/choose) and adding `@patrickvonplaten` as a tag. - ### Fine-Tuning with TIMIT Let's take a look at the [script](./finetune_base_timit_asr.sh) used to fine-tune [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) with the [TIMIT dataset](https://huggingface.co/datasets/timit_asr): diff --git a/examples/research_projects/xls_r/README.md b/examples/research_projects/xls_r/README.md new file mode 100644 index 0000000000..7c1cc5cc21 --- /dev/null +++ b/examples/research_projects/xls_r/README.md @@ -0,0 +1,395 @@ +# Speech recognition community week - version 2 πŸ€— + +Welcome to the 2nd version of the speech recognition community eventπŸŽ™οΈ ! +The goal of this event is to build **robust**, **real-world** speech recognition (ASR) models in as many languages as possible🌏🌍🌎. + +If necessary and available, free access to a V100 32 GB GPU will kindly be provided by the [OVH team](https://us.ovhcloud.com/) πŸš€. + +This document summarizes all the relevant information required for the speech community eventπŸ“‹. + +Don't forget to sign up [here](TODO: Create google from)πŸ€—. + +## Table of Contents + +- [Organization](#organization) +- [Important dates](#important-dates) +- [How to install pytorch, transformers, datasets](#how-to-install-relevant-libraries) +- [How to fine-tune a speech recognition model](#how-to-finetune-a-model) +- [Talks](#talks) +- [Project evaluation](#project-evaluation) +- [General Tips & Tricks](#general-tips-and-tricks) +- [FAQ](#faq) + +## Organization + +Participants are encouraged to leverage pre-trained speech recognition checkpoints, +preferably [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53), to train a speech recognition system in a language of their +choice. + +Participants can make use of whatever data they think is useful to build a +**robust** speech recognition system for **real-world** audio data. We strongly +recommend making use of [Mozilla's diverse Common Voice dataset](https://huggingface.co/datasets/mozilla-foundation/common_voice_7_0) when training the model. +Please do **not** use the `"test"` split of the Common Voice datasets for training +as we will likely use this split for the final evaluation of your model. +We kindly ask you to make sure that the dataset that you are using for training +has the appropriate licensing - see [here](TODO: ) for more information. + +During the event, the fine-tuned models will regularly be tested on a **development +dataset** provided by the Hugging Face team and at the end of the event, all models +will be tested on a **test dataset**. For each language, +the best performing model will receive a prize πŸ† - more information regarding +the testing [here](TODO: ) and prizes [here](TODO: ). We believe that framing the +event as a competition is more fun, but at the core, we strongly encourage +participants to work together by helping each other to solve bugs, share important findings, etc...πŸ€— + +If possible it is encouraged to fine-tune the models on local GPU machines, but +if those are not available, the OVH cloud team kindly provides a limited +number of GPUs for the event. For more information on how to get access to the GPU - see [here](TODO: ). + + +**Please note**: +All important announcements will be made on discord. Please make sure that +you've joined the following discord server: TODO: fill out. +Please make sure that you have been added to the [Speech Event Organization](https://huggingface.co/speech-recognition-community-v2). You should have received an +invite by email. If you didn't receive an invite, please contact the organizers, *e.g.* Anton, Patrick, or Omar on discord. + + +## Important dates + +- **12.01.** Official announcement of the community event. Make sure to sign-up in [this google form](TODO: ) +- **12.01. - 19.01.** Participants sign up for the event. +- **19.01.** Release of all relevant guides and training scripts. +- **24.01.** Start of the community week! OVH & Hugging Face gives access to GPUs. +- **24.01. - 07.02.** The OVH & Hugging Face team will be available for any questions, problems the participants might have. +- **07.02.** Access to GPU is deactivated and community week officially ends. + +## How to install relevant libraries + +The following libraries are required to fine-tune a speech model with πŸ€— Transformers and πŸ€— Datasets in PyTorch. + +- [PyTorch](https://pytorch.org/) +- [Transformers](https://github.com/huggingface/transformers) +- [Datasets](https://github.com/huggingface/datasets) + +We recommend installing the above libraries in a [virtual environment](https://docs.python.org/3/library/venv.html). +If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going +to use and activate it. + +You should be able to run the command: + +```bash +python3 -m venv +``` + +You can activate your venv by running + +```bash +source ~//bin/activate +``` + +To begin with please make sure you have PyTorch and CUDA correctly installed. +The following command should return ``True``: + +```bash +python -c "import torch; print(torch.cuda.is_available())" +``` + +If the above command doesn't print ``True``, in a first step, please follow the +instructions [here](https://pytorch.org/) to install PyTorch with CUDA. + +We strongly recommend making use of the provided PyTorch examples scripts in [transformers/examples/pytorch/speech-recognition](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition) to train your speech recognition +system. +In all likelihood, you will adjust one of the example scripts, so we recommend forking and cloning the πŸ€— Transformers repository as follows. + +1. Fork the [repository](https://github.com/huggingface/transformers) by + clicking on the 'Fork' button on the repository's page. This creates a copy of the code + under your GitHub user account. + +2. Clone your fork to your local disk, and add the base repository as a remote: + + ```bash + $ git clone https://github.com//transformers.git + $ cd transformers + $ git remote add upstream https://github.com/huggingface/transformers.git + ``` + +3. Create a new branch to hold your development changes. This is especially useful to share code changes with your team: + + ```bash + $ git checkout -b a-descriptive-name-for-my-project + ``` + +4. Set up a PyTorch environment by running the following command your virtual environment: + + ```bash + $ pip install -e ".[torch-speech]" + ``` + + (If transformers was already installed in the virtual environment, remove + it with `pip uninstall transformers` before reinstalling it in editable + mode with the `-e` flag.) + + If you have already cloned that repo, you might need to `git pull` to get the most recent changes in the `transformers` + library. + + Running this command will automatically install `pytorch` and the most relevant + libraries required for fine-tuning a speech recognition system. + +Next, you should also install the πŸ€— Datasets library. We strongly recommend installing the +library from source to profit from the most current additions during the community week. + +Simply run the following steps: + +``` +$ cd ~/ +$ git clone https://github.com/huggingface/datasets.git +$ cd datasets +$ pip install -e ".[streaming]" +``` + +If you plan on contributing a specific dataset during +the community week, please fork the datasets repository and follow the instructions +[here](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-create-a-pull-request). + +To verify that all libraries are correctly installed, you can run the following command in a Python shell. +It verifies that both `transformers` and `datasets` have been correclty installed. + +```python +from transformers import AutoModelForCTC, AutoProcessor +from datasets import load_dataset + +dummy_dataset = load_dataset("common_voice", "ab", split="test") + +model = AutoModelForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2") +model.to("cuda") + +processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2") + +input_values = processor(dummy_dataset[0]["audio"]["array"], return_tensors="pt", sampling_rate=16_000).input_values +input_values = input_values.to("cuda") + +logits = model(input_values).logits + +assert logits.shape[-1] == 32 +``` + +## How to finetune a model + +In this section, we show you how to fine-tune a pre-trained [XLS-R Model](https://huggingface.co/docs/transformers/model_doc/xls_r) on the [Common Voice 7 dataset](https://huggingface.co/datasets/mozilla-foundation/common_voice_7_0). + +We recommend fine-tuning one of the following pre-trained XLS-R checkpoints: + +- [300M parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-300m) +- [1B parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-1b) +- [2B parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-2b) + +To begin with, please note that to use the Common Voice dataset, you +have to accept that **your email address** and **username** are shared with the +mozilla-foundation. To get access to the dataset please click on "*Access repository*" [here](https://huggingface.co/datasets/mozilla-foundation/common_voice_7_0). + +Next, we recommended that you get familiar with the XLS-R model and its capabilities. +In collaboration with [Fairseq's Wav2Vec2 team](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec), +we've written ["Fine-tuning XLS-R for Multi-Lingual ASR with πŸ€— Transformers"](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) which gives an in-detail explanation of how XLS-R functions and how it can be fine-tuned. + +The blog can also be opened and directly fine-tuned in a google colab notebook. +In this section, we will explain how to fine-tune the model on a local machine. + +1. **Log in** + +To begin with you should check that you are correctly logged in and that you have `git-lfs` installed so that your fine-tuned model can automatically be uploaded. + +Run: + +```bash +huggingface-cli login +``` + +to login. It is recommend to login with your personal access token that can be found under your hugging face profile (icon in the top right corner on [hf.co](http://hf.co/), then Settings -> Access Tokens -> User Access Tokens -> New Token (if haven't generated one already) + +You can then copy-paste this token to log in locally. + +2. **Create your model repository** + +First, let's make sure that `git-lfs` is correctly installed. To so, simply run: + +```bash +git-lfs -v +``` + +The output should show something like `git-lfs/2.13.2 (GitHub; linux amd64; go 1.15.4)`. If your console states that the `git-lfs` command was not found, please make +sure to install it [here](https://git-lfs.github.com/) or simply via: + +```bash +sudo apt-get install git-lfs +``` + +Now you can create your model repository which will contain all relevant files to +reproduce your training. You can either directly create the model repository on the +Hub (Settings -> New Model) or via the CLI. Here we choose to use the CLI instead. + +Assuming that we want to call our model repository *xls-r-ab-test*, we can run the +following command: + +```bash +huggingface-cli repo create xls-r-ab-test +``` + +You can now see the model on the Hub, *e.g.* under https://huggingface.co/hf-test/xls-r-ab-test . + +Let's clone the repository so that we can define our training script inside. + +```bash +git lfs install +git clone https://huggingface.co/hf-test/xls-r-ab-test +``` + +3. **Add your training script and `run`-command to the repository** + +We encourage participants to add all relevant files for training directly to the +directory so that everything is fully reproducible. + +Let's first copy-paste the official training script from our clone +of `transformers` to our just created directory: + +```bash +cp ~/transformers/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py ./ +``` + +Next, we'll create a bash file to define the hyper-parameters and configurations +for training. More detailed information on different settings (single-GPU vs. multi-GPU) can be found [here](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition#connectionist-temporal-classification). + +For demonstration purposes, we will use a dummy XLS-R model `model_name_or_path="hf-test/xls-r-dummy"` on the very low-resource language of "Abkhaz" of [Common Voice 7](https://huggingface.co/datasets/mozilla-foundation/common_voice_7_0): `dataset_config_name="ab"` for just a single epoch. + +Before starting to train, let's make sure we have installed all the required libraries. You might want to run: + +```bash +pip install -r ~/transformers/examples/pytorch/speech-recognition/requirements.txt +``` + +Alright, finally we can define the training script. We'll simply use some +dummy hyper-parameters and configurations for demonstration purposes. + +Note that we add the flag `--use_auth_token` so that datasets requiring access, +such as [Common Voice 7](https://huggingface.co/datasets/mozilla-foundation/common_voice_7_0) can be downloaded. In addition, we add the `--push_to_hub` flag to make use of the +[Trainers `push_to-hub` functionality](https://huggingface.co/docs/transformers/master/en/main_classes/trainer#transformers.Trainer.push_to_hub) so that your model will be automatically uploaded to the Hub. + +Let's copy the following code snippet in a file called `run.sh` + +```bash +echo '''python run_speech_recognition_ctc.py \ + --dataset_name="mozilla-foundation/common_voice_7_0" \ + --model_name_or_path="hf-test/xls-r-dummy" \ + --dataset_config_name="ab" \ + --output_dir="./" \ + --overwrite_output_dir \ + --max_steps="10" \ + --per_device_train_batch_size="2" \ + --learning_rate="3e-4" \ + --save_total_limit="1" \ + --evaluation_strategy="steps" \ + --text_column_name="sentence" \ + --length_column_name="input_length" \ + --save_steps="5" \ + --layerdrop="0.0" \ + --freeze_feature_encoder \ + --gradient_checkpointing \ + --fp16 \ + --group_by_length \ + --push_to_hub \ + --use_auth_token \ + --do_train --do_eval''' > run.sh +``` + +4. **Start training** + +Now all that is left to do is to start training the model by executing the +run file. + +```bash +bash run.sh +``` + +The training should not take more than a couple of minutes. +During the training intermediate saved checkpoints are automatically uploaded to +your model repository as can be seen [on this commit](https://huggingface.co/hf-test/xls-r-ab-test/commit/0eb19a0fca4d7d163997b59663d98cd856022aa6) . + +At the end of the training, the [Trainer](https://huggingface.co/docs/transformers/master/en/main_classes/trainer) automatically creates a nice model card and all +relevant files are uploaded. + +5. **Tips for real model training** + +The above steps illustrate how a model can technically be fine-tuned. +However as you can see on the model card [hf-test/xls-r-ab-test](https://huggingface.co/hf-test/xls-r-ab-test), our demonstration has a very poor performance which is +not surprising given that we trained for just 10 steps on a randomly initialized +model. + +For a real model training, one of the actual pre-trained XLS-R models should be used: + +- [300M parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-300m) +- [1B parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-1b) +- [2B parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-2b) + +Also, the hyper-parameters should be carefully chosen depending on the dataset. +As an example, we will fine-tune the 300M parameters model on Swedish on a single +TITAN RTX 24GB GPU. + +The model will be called `"xls-r-300m-sv"`. +Following the above steps we first create the model: + +```bash +huggingface-cli repo create xls-r-300m-sv +``` + +and then clone it locally: + +```bash + + +and we define the following + +hyperparameters for training + +```bash + +echo '''python run_speech_recognition_ctc.py \ + --dataset_name="mozilla-foundation/common_voice_7_0" \ + --model_name_or_path="facebook/wav2vec2-xls-r-300m" \ + --dataset_config_name="sv-SE" \ + --output_dir="./" \ + --overwrite_output_dir \ + --num_train_epochs="50" \ + --per_device_train_batch_size="8" \ + --per_device_eval_batch_size="8" \ + --gradient_accumulation_steps="4" \ + --learning_rate="7.5e-5" \ + --warmup_steps="2000" \ + --length_column_name="input_length" \ + --evaluation_strategy="steps" \ + --text_column_name="sentence" \ + --chars_to_ignore , ? . ! \- \; \: \" β€œ % β€˜ ” οΏ½ β€” ’ … – \ + --save_steps="500" \ + --eval_steps="500" \ + --logging_steps="100" \ + --layerdrop="0.0" \ + --activation_dropout="0.1" \ + --save_total_limit="3" \ + --freeze_feature_encoder \ + --feat_proj_dropout="0.0" \ + --mask_time_prob="0.75" \ + --mask_time_length="10" \ + --mask_feature_prob="0.25" \ + --mask_feature_length="64" \ + --gradient_checkpointing \ + --use_auth_token \ + --fp16 \ + --group_by_length \ + --do_train --do_eval \ + --push_to_hub''' > run.sh +``` + +The training takes *ca.* 7 hours and yields a reasonable test word +error rate of 27% as can be seen on the automatically generated [model card](https://huggingface.co/hf-test/xls-r-300m-sv). + +The above-chosen hyperparameters probably work quite well on a range of different +datasets and languages, but are by no means optimal. It is up to you to find a good set of +hyperparameters.