From d72343d2b804d0304d93bac1c1b58e0dafd5e820 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 10 Jan 2022 10:46:21 +0100
Subject: [PATCH] [Wav2Vec2 Speech Event] Add speech event v2 (#15083)

* up

* up

* up

* up

* up

* up

* improve

* up

* up

* Update src/transformers/trainer.py

* up

* up

* up
---
 examples/pytorch/speech-recognition/README.md |   2 +
 .../speech-recognition/requirements.txt       |   1 +
 .../run_speech_recognition_ctc.py             |  10 +-
 examples/research_projects/wav2vec2/README.md |   6 +-
 examples/research_projects/xls_r/README.md    | 395 ++++++++++++++++++
 5 files changed, 409 insertions(+), 5 deletions(-)
 create mode 100644 examples/research_projects/xls_r/README.md

diff --git a/examples/pytorch/speech-recognition/README.md b/examples/pytorch/speech-recognition/README.md
index 8eeb8efc87..7db7811ba2 100644
--- a/examples/pytorch/speech-recognition/README.md
+++ b/examples/pytorch/speech-recognition/README.md
@@ -74,6 +74,7 @@ python run_speech_recognition_ctc.py \
 	--warmup_steps="500" \
 	--evaluation_strategy="steps" \
 	--text_column_name="sentence" \
+	--length_column_name="input_length" \
 	--save_steps="400" \
 	--eval_steps="100" \
 	--layerdrop="0.0" \
@@ -108,6 +109,7 @@ python -m torch.distributed.launch \
 	--warmup_steps="500" \
 	--evaluation_strategy="steps" \
 	--text_column_name="sentence" \
+	--length_column_name="input_length" \
 	--save_steps="400" \
 	--eval_steps="100" \
 	--logging_steps="1" \
diff --git a/examples/pytorch/speech-recognition/requirements.txt b/examples/pytorch/speech-recognition/requirements.txt
index b5c58171d3..b8c475e40a 100644
--- a/examples/pytorch/speech-recognition/requirements.txt
+++ b/examples/pytorch/speech-recognition/requirements.txt
@@ -2,3 +2,4 @@ datasets >= 1.13.3
 torch >= 1.5
 torchaudio
 librosa
+jiwer
diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
index 1784b613fa..0c460c3f02 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -396,7 +396,10 @@ def main():
 
     if training_args.do_train:
         raw_datasets["train"] = load_dataset(
-            data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            split=data_args.train_split_name,
+            use_auth_token=data_args.use_auth_token,
         )
 
         if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -418,7 +421,10 @@ def main():
 
     if training_args.do_eval:
         raw_datasets["eval"] = load_dataset(
-            data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            split=data_args.eval_split_name,
+            use_auth_token=data_args.use_auth_token,
         )
 
         if data_args.max_eval_samples is not None:
diff --git a/examples/research_projects/wav2vec2/README.md b/examples/research_projects/wav2vec2/README.md
index 235284840c..9561eb252a 100644
--- a/examples/research_projects/wav2vec2/README.md
+++ b/examples/research_projects/wav2vec2/README.md
@@ -1,12 +1,12 @@
+**NOTE**: This example is outdated and is not longer actively maintained. Please 
+follow the new instructions of fine-tuning Wav2Vec2 [here](https://github.com/huggingface/transformers/blob/master/examples/pytorch/speech-recognition/README.md)
+
 ## Fine-tuning Wav2Vec2
 
 The `run_asr.py` script allows one to fine-tune pretrained Wav2Vec2 models that can be found [here](https://huggingface.co/models?search=facebook/wav2vec2).
 
 This finetuning script can also be run as a google colab [TODO: here]( ).
 
-The script is actively maintained by [Patrick von Platen](https://github.com/patrickvonplaten).
-Feel free to ask a question on the [Forum](https://discuss.huggingface.co/) or post an issue on [GitHub](https://github.com/huggingface/transformers/issues/new/choose) and adding `@patrickvonplaten` as a tag.
-
 ### Fine-Tuning with TIMIT
 Let's take a look at the [script](./finetune_base_timit_asr.sh) used to fine-tune [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base)
 with the [TIMIT dataset](https://huggingface.co/datasets/timit_asr):
diff --git a/examples/research_projects/xls_r/README.md b/examples/research_projects/xls_r/README.md
new file mode 100644
index 0000000000..7c1cc5cc21
--- /dev/null
+++ b/examples/research_projects/xls_r/README.md
@@ -0,0 +1,395 @@
+# Speech recognition community week - version 2 🤗
+
+Welcome to the 2nd version of the speech recognition community event🎙️ !
+The goal of this event is to build **robust**, **real-world** speech recognition (ASR) models in as many languages as possible🌏🌍🌎.
+
+If necessary and available, free access to a V100 32 GB GPU will kindly be provided by the [OVH team](https://us.ovhcloud.com/) 🚀.
+
+This document summarizes all the relevant information required for the speech community event📋.
+
+Don't forget to sign up [here](TODO: Create google from)🤗.
+
+## Table of Contents
+
+- [Organization](#organization)
+- [Important dates](#important-dates)
+- [How to install pytorch, transformers, datasets](#how-to-install-relevant-libraries)
+- [How to fine-tune a speech recognition model](#how-to-finetune-a-model)
+- [Talks](#talks)
+- [Project evaluation](#project-evaluation)
+- [General Tips & Tricks](#general-tips-and-tricks)
+- [FAQ](#faq)
+
+## Organization
+
+Participants are encouraged to leverage pre-trained speech recognition checkpoints,
+preferably [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53), to train a speech recognition system in a language of their 
+choice.
+
+Participants can make use of whatever data they think is useful to build a 
+**robust** speech recognition system for **real-world** audio data. We strongly 
+recommend making use of [Mozilla's diverse Common Voice dataset](https://huggingface.co/datasets/mozilla-foundation/common_voice_7_0) when training the model.
+Please do **not** use the `"test"` split of the Common Voice datasets for training
+as we will likely use this split for the final evaluation of your model.
+We kindly ask you to make sure that the dataset that you are using for training 
+has the appropriate licensing - see [here](TODO: ) for more information.
+
+During the event, the fine-tuned models will regularly be tested on a **development 
+dataset** provided by the Hugging Face team and at the end of the event, all models 
+will be tested on a **test dataset**. For each language, 
+the best performing model will receive a prize 🏆 - more information regarding 
+the testing [here](TODO: ) and prizes [here](TODO: ). We believe that framing the 
+event as a competition is more fun, but at the core, we strongly encourage 
+participants to work together by helping each other to solve bugs, share important findings, etc...🤗
+
+If possible it is encouraged to fine-tune the models on local GPU machines, but 
+if those are not available, the OVH cloud team kindly provides a limited 
+number of GPUs for the event. For more information on how to get access to the GPU - see [here](TODO: ).
+
+
+**Please note**:
+All important announcements will be made on discord. Please make sure that 
+you've joined the following discord server: TODO: fill out.
+Please make sure that you have been added to the [Speech Event Organization](https://huggingface.co/speech-recognition-community-v2). You should have received an 
+invite by email. If you didn't receive an invite, please contact the organizers, *e.g.* Anton, Patrick, or Omar on discord.
+
+
+## Important dates
+
+- **12.01.** Official announcement of the community event. Make sure to sign-up in [this google form](TODO: )
+- **12.01. - 19.01.** Participants sign up for the event.
+- **19.01.** Release of all relevant guides and training scripts.
+- **24.01.** Start of the community week! OVH & Hugging Face gives access to GPUs.
+- **24.01. - 07.02.** The OVH & Hugging Face team will be available for any questions, problems the participants might have.
+- **07.02.** Access to GPU is deactivated and community week officially ends.
+
+## How to install relevant libraries
+
+The following libraries are required to fine-tune a speech model with 🤗 Transformers and 🤗 Datasets in PyTorch.
+
+- [PyTorch](https://pytorch.org/)
+- [Transformers](https://github.com/huggingface/transformers)
+- [Datasets](https://github.com/huggingface/datasets)
+
+We recommend installing the above libraries in a [virtual environment](https://docs.python.org/3/library/venv.html). 
+If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going
+to use and activate it.
+
+You should be able to run the command:
+
+```bash
+python3 -m venv <your-venv-name>
+```
+
+You can activate your venv by running
+
+```bash
+source ~/<your-venv-name>/bin/activate
+```
+
+To begin with please make sure you have PyTorch and CUDA correctly installed. 
+The following command should return ``True``:
+
+```bash
+python -c "import torch; print(torch.cuda.is_available())"
+```
+
+If the above command doesn't print ``True``, in a first step, please follow the
+instructions [here](https://pytorch.org/) to install PyTorch with CUDA.
+
+We strongly recommend making use of the provided PyTorch examples scripts in [transformers/examples/pytorch/speech-recognition](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition) to train your speech recognition
+system.
+In all likelihood, you will adjust one of the example scripts, so we recommend forking and cloning the 🤗 Transformers repository as follows. 
+
+1. Fork the [repository](https://github.com/huggingface/transformers) by
+   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
+   under your GitHub user account.
+
+2. Clone your fork to your local disk, and add the base repository as a remote:
+
+   ```bash
+   $ git clone https://github.com/<your Github handle>/transformers.git
+   $ cd transformers
+   $ git remote add upstream https://github.com/huggingface/transformers.git
+   ```
+
+3. Create a new branch to hold your development changes. This is especially useful to share code changes with your team:
+
+   ```bash
+   $ git checkout -b a-descriptive-name-for-my-project
+   ```
+
+4. Set up a PyTorch environment by running the following command your virtual environment:
+
+   ```bash
+   $ pip install -e ".[torch-speech]"
+   ```
+
+   (If transformers was already installed in the virtual environment, remove
+   it with `pip uninstall transformers` before reinstalling it in editable
+   mode with the `-e` flag.)
+
+   If you have already cloned that repo, you might need to `git pull` to get the most recent changes in the `transformers`
+   library.
+
+   Running this command will automatically install `pytorch` and the most relevant 
+   libraries required for fine-tuning a speech recognition system.
+
+Next, you should also install the 🤗 Datasets library. We strongly recommend installing the 
+library from source to profit from the most current additions during the community week.
+
+Simply run the following steps:
+
+```
+$ cd ~/
+$ git clone https://github.com/huggingface/datasets.git
+$ cd datasets
+$ pip install -e ".[streaming]"
+```
+
+If you plan on contributing a specific dataset during 
+the community week, please fork the datasets repository and follow the instructions 
+[here](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-create-a-pull-request).
+
+To verify that all libraries are correctly installed, you can run the following command in a Python shell.
+It verifies that both `transformers` and `datasets` have been correclty installed.
+
+```python
+from transformers import AutoModelForCTC, AutoProcessor
+from datasets import load_dataset
+
+dummy_dataset = load_dataset("common_voice", "ab", split="test")
+
+model = AutoModelForCTC.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
+model.to("cuda")
+
+processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
+
+input_values = processor(dummy_dataset[0]["audio"]["array"], return_tensors="pt", sampling_rate=16_000).input_values
+input_values = input_values.to("cuda")
+
+logits = model(input_values).logits
+
+assert logits.shape[-1] == 32
+```
+
+## How to finetune a model
+
+In this section, we show you how to fine-tune a pre-trained [XLS-R Model](https://huggingface.co/docs/transformers/model_doc/xls_r) on the [Common Voice 7 dataset](https://huggingface.co/datasets/mozilla-foundation/common_voice_7_0). 
+
+We recommend fine-tuning one of the following pre-trained XLS-R checkpoints:
+
+- [300M parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-300m)
+- [1B parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-1b)
+- [2B parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-2b)
+
+To begin with, please note that to use the Common Voice dataset, you 
+have to accept that **your email address** and **username** are shared with the 
+mozilla-foundation. To get access to the dataset please click on "*Access repository*" [here](https://huggingface.co/datasets/mozilla-foundation/common_voice_7_0).
+
+Next, we recommended that you get familiar with the XLS-R model and its capabilities.
+In collaboration with [Fairseq's Wav2Vec2 team](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec), 
+we've written ["Fine-tuning XLS-R for Multi-Lingual ASR with 🤗 Transformers"](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) which gives an in-detail explanation of how XLS-R functions and how it can be fine-tuned.
+
+The blog can also be opened and directly fine-tuned in a google colab notebook.
+In this section, we will explain how to fine-tune the model on a local machine.
+
+1. **Log in**
+
+To begin with you should check that you are correctly logged in and that you have `git-lfs` installed so that your fine-tuned model can automatically be uploaded.
+
+Run:
+
+```bash
+huggingface-cli login
+```
+
+to login. It is recommend to login with your personal access token that can be found under your hugging face profile (icon in the top right corner on [hf.co](http://hf.co/), then Settings -> Access Tokens -> User Access Tokens -> New Token (if haven't generated one already)
+
+You can then copy-paste this token to log in locally.
+
+2. **Create your model repository**
+
+First, let's make sure that `git-lfs` is correctly installed. To so, simply run:
+
+```bash
+git-lfs -v
+```
+
+The output should show something like `git-lfs/2.13.2 (GitHub; linux amd64; go 1.15.4)`. If your console states that the `git-lfs` command was not found, please make
+sure to install it [here](https://git-lfs.github.com/) or simply via: 
+
+```bash
+sudo apt-get install git-lfs
+```
+
+Now you can create your model repository which will contain all relevant files to 
+reproduce your training. You can either directly create the model repository on the 
+Hub (Settings -> New Model) or via the CLI. Here we choose to use the CLI instead.
+
+Assuming that we want to call our model repository *xls-r-ab-test*, we can run the 
+following command:
+
+```bash
+huggingface-cli repo create xls-r-ab-test
+```
+
+You can now see the model on the Hub, *e.g.* under https://huggingface.co/hf-test/xls-r-ab-test .
+
+Let's clone the repository so that we can define our training script inside.
+
+```bash
+git lfs install
+git clone https://huggingface.co/hf-test/xls-r-ab-test
+```
+
+3. **Add your training script and `run`-command to the repository**
+
+We encourage participants to add all relevant files for training directly to the 
+directory so that everything is fully reproducible.
+
+Let's first copy-paste the official training script from our clone 
+of `transformers` to our just created directory:
+
+```bash
+cp ~/transformers/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py ./
+```
+
+Next, we'll create a bash file to define the hyper-parameters and configurations 
+for training. More detailed information on different settings (single-GPU vs. multi-GPU) can be found [here](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition#connectionist-temporal-classification).
+
+For demonstration purposes, we will use a dummy XLS-R model `model_name_or_path="hf-test/xls-r-dummy"` on the very low-resource language of "Abkhaz" of [Common Voice 7](https://huggingface.co/datasets/mozilla-foundation/common_voice_7_0): `dataset_config_name="ab"` for just a single epoch.
+
+Before starting to train, let's make sure we have installed all the required libraries. You might want to run:
+
+```bash
+pip install -r ~/transformers/examples/pytorch/speech-recognition/requirements.txt
+```
+
+Alright, finally we can define the training script. We'll simply use some 
+dummy hyper-parameters and configurations for demonstration purposes.
+
+Note that we add the flag `--use_auth_token` so that datasets requiring access, 
+such as [Common Voice 7](https://huggingface.co/datasets/mozilla-foundation/common_voice_7_0) can be downloaded. In addition, we add the `--push_to_hub` flag to make use of the 
+[Trainers `push_to-hub` functionality](https://huggingface.co/docs/transformers/master/en/main_classes/trainer#transformers.Trainer.push_to_hub) so that your model will be automatically uploaded to the Hub.
+
+Let's copy the following code snippet in a file called `run.sh`
+
+```bash
+echo '''python run_speech_recognition_ctc.py \
+	--dataset_name="mozilla-foundation/common_voice_7_0" \
+	--model_name_or_path="hf-test/xls-r-dummy" \
+	--dataset_config_name="ab" \
+	--output_dir="./" \
+	--overwrite_output_dir \
+	--max_steps="10" \
+	--per_device_train_batch_size="2" \
+	--learning_rate="3e-4" \
+	--save_total_limit="1" \
+	--evaluation_strategy="steps" \
+	--text_column_name="sentence" \
+	--length_column_name="input_length" \
+	--save_steps="5" \
+	--layerdrop="0.0" \
+	--freeze_feature_encoder \
+	--gradient_checkpointing \
+	--fp16 \
+	--group_by_length \
+	--push_to_hub \
+	--use_auth_token \
+	--do_train --do_eval''' > run.sh
+```
+
+4. **Start training**
+
+Now all that is left to do is to start training the model by executing the 
+run file.
+
+```bash
+bash run.sh
+```
+
+The training should not take more than a couple of minutes. 
+During the training intermediate saved checkpoints are automatically uploaded to
+your model repository as can be seen [on this commit](https://huggingface.co/hf-test/xls-r-ab-test/commit/0eb19a0fca4d7d163997b59663d98cd856022aa6) . 
+
+At the end of the training, the [Trainer](https://huggingface.co/docs/transformers/master/en/main_classes/trainer) automatically creates a nice model card and all 
+relevant files are uploaded.
+
+5. **Tips for real model training**
+
+The above steps illustrate how a model can technically be fine-tuned.
+However as you can see on the model card [hf-test/xls-r-ab-test](https://huggingface.co/hf-test/xls-r-ab-test), our demonstration has a very poor performance which is
+not surprising given that we trained for just 10 steps on a randomly initialized
+model.
+
+For a real model training, one of the actual pre-trained XLS-R models should be used:
+
+- [300M parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-300m)
+- [1B parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-1b)
+- [2B parameters version](https://huggingface.co/facebook/wav2vec2-xls-r-2b)
+
+Also, the hyper-parameters should be carefully chosen depending on the dataset.
+As an example, we will fine-tune the 300M parameters model on Swedish on a single 
+TITAN RTX 24GB GPU.
+
+The model will be called `"xls-r-300m-sv"`. 
+Following the above steps we first create the model:
+
+```bash
+huggingface-cli repo create xls-r-300m-sv
+```
+
+and then clone it locally:
+
+```bash
+
+
+and we define the following 
+
+hyperparameters for training
+
+```bash
+
+echo '''python run_speech_recognition_ctc.py \
+	--dataset_name="mozilla-foundation/common_voice_7_0" \
+	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
+	--dataset_config_name="sv-SE" \
+	--output_dir="./" \
+	--overwrite_output_dir \
+	--num_train_epochs="50" \
+	--per_device_train_batch_size="8" \
+	--per_device_eval_batch_size="8" \
+	--gradient_accumulation_steps="4" \
+	--learning_rate="7.5e-5" \
+	--warmup_steps="2000" \
+	--length_column_name="input_length" \
+	--evaluation_strategy="steps" \
+	--text_column_name="sentence" \
+	--chars_to_ignore , ? . ! \- \; \: \" “ % ‘ ” � — ’ … – \
+	--save_steps="500" \
+	--eval_steps="500" \
+	--logging_steps="100" \
+	--layerdrop="0.0" \
+	--activation_dropout="0.1" \
+	--save_total_limit="3" \
+	--freeze_feature_encoder \
+	--feat_proj_dropout="0.0" \
+	--mask_time_prob="0.75" \
+	--mask_time_length="10" \
+	--mask_feature_prob="0.25" \
+	--mask_feature_length="64" \
+	--gradient_checkpointing \
+	--use_auth_token \
+	--fp16 \
+	--group_by_length \
+	--do_train --do_eval \
+	--push_to_hub''' > run.sh
+```
+
+The training takes *ca.* 7 hours and yields a reasonable test word 
+error rate of 27% as can be seen on the automatically generated [model card](https://huggingface.co/hf-test/xls-r-300m-sv).
+
+The above-chosen hyperparameters probably work quite well on a range of different 
+datasets and languages, but are by no means optimal. It is up to you to find a good set of 
+hyperparameters.