From e226a24f84581d3ed2b0cb1546791a72b689581e Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <aglozhkov@gmail.com>
Date: Mon, 21 Mar 2022 22:33:59 +0400
Subject: [PATCH] [xtreme-s] Update Minds14 results (#16241)

* update results

* per-language metrics

* Format the per-language metrics
---
 examples/research_projects/xtreme-s/README.md |  5 +-
 .../xtreme-s/run_xtreme_s.py                  | 95 +++++++++++++------
 2 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/examples/research_projects/xtreme-s/README.md b/examples/research_projects/xtreme-s/README.md
index 3c74f634eb..8f436fa289 100644
--- a/examples/research_projects/xtreme-s/README.md
+++ b/examples/research_projects/xtreme-s/README.md
@@ -67,7 +67,7 @@ The corresponding training commands for each dataset are given in the sections b
 | Speech Recognition    | VoxPopuli | -                     | -                                                                  | -             | -      |
 | Speech Recognition    | FLEURS    | -                     | -                                                                  | -             | -      |
 | Speech Translation    | CoVoST-2  | -                     | -                                                                  | -             | -      |
-| Speech Classification | Minds-14  | 94.74 F1 / 94.70 Acc. | [here](https://huggingface.co/anton-l/xtreme_s_xlsr_300m_minds14/) | 04:46:40      | 2xA100 |
+| Speech Classification | Minds-14  | 90.15 F1 / 90.33 Acc. | [here](https://huggingface.co/anton-l/xtreme_s_xlsr_300m_minds14/) | 02:54:21      | 2xA100 |
 | Speech Classification | FLEURS    | -                     | -                                                                  | -             | -      |
 | Speech Retrieval      | FLEURS    | -                     | -                                                                  | -             | -      |
 
@@ -82,7 +82,6 @@ python -m torch.distributed.launch \
     --task="mls" \
     --language="all" \
     --model_name_or_path="facebook/wav2vec2-xls-r-300m" \
-    --eval_split_name="test" \
     --output_dir="xtreme_s_xlsr_300m_mls" \
     --overwrite_output_dir \
     --num_train_epochs=100 \
@@ -158,4 +157,4 @@ python -m torch.distributed.launch \
     --push_to_hub
 ```
 
-On 2 A100 GPUs, this script should run in ~5 hours and yield a cross-entropy loss of **0.2890** and F1 score of **94.74**
+On 2 A100 GPUs, this script should run in ~3 hours and yield a cross-entropy loss of **0.4119** and an F1 score of **90.15**.
diff --git a/examples/research_projects/xtreme-s/run_xtreme_s.py b/examples/research_projects/xtreme-s/run_xtreme_s.py
index 227380962f..b6a6e7ae2c 100644
--- a/examples/research_projects/xtreme-s/run_xtreme_s.py
+++ b/examples/research_projects/xtreme-s/run_xtreme_s.py
@@ -20,6 +20,7 @@ import logging
 import os
 import re
 import sys
+from collections import OrderedDict, defaultdict
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
@@ -273,6 +274,13 @@ class DataTrainingArguments:
             " input audio to a sequence of phoneme sequences."
         },
     )
+    per_lang_metrics: bool = field(
+        default=True,
+        metadata={
+            "help": "If `True`, compute the test metrics separately for each language, and average the results. "
+            "If `False` compute the average test metrics in a single pass for all languages at once."
+        },
+    )
 
 
 @dataclass
@@ -470,10 +478,6 @@ def main():
         if data_args.max_train_samples is not None:
             raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
 
-        if not is_text_target:
-            label_list = raw_datasets["train"].features[target_column_name].names
-            num_labels = len(label_list)
-
     if training_args.do_eval:
         raw_datasets["eval"] = load_dataset(
             data_args.dataset_name,
@@ -498,6 +502,11 @@ def main():
         if data_args.max_predict_samples is not None:
             raw_datasets["predict"] = raw_datasets["predict"].select(range(data_args.max_predict_samples))
 
+    if not is_text_target:
+        label_list = next(iter(raw_datasets.values())).features[target_column_name].names
+        lang_list = next(iter(raw_datasets.values())).features["lang_id"].names
+        num_labels = len(label_list)
+
     # 2. We remove some special characters from the datasets
     # that make training complicated and do not help in transcribing the speech
     # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
@@ -593,31 +602,33 @@ def main():
     )
 
     # adapt config
-    config.update(
-        {
-            "feat_proj_dropout": model_args.feat_proj_dropout,
-            "attention_dropout": model_args.attention_dropout,
-            "hidden_dropout": model_args.hidden_dropout,
-            "final_dropout": model_args.final_dropout,
-            "mask_time_prob": model_args.mask_time_prob,
-            "mask_time_length": model_args.mask_time_length,
-            "mask_feature_prob": model_args.mask_feature_prob,
-            "mask_feature_length": model_args.mask_feature_length,
-            "gradient_checkpointing": training_args.gradient_checkpointing,
-            "layerdrop": model_args.layerdrop,
-            "ctc_loss_reduction": model_args.ctc_loss_reduction,
-            "activation_dropout": model_args.activation_dropout,
-        }
-    )
-    if training_args.do_train:
-        if is_text_target:
-            config.pad_token_id = tokenizer.pad_token_id
-            config.vocab_size = len(tokenizer)
-        else:
-            label_to_id = {v: i for i, v in enumerate(label_list)}
-            config.label2id = label_to_id
-            config.id2label = {id: label for label, id in label_to_id.items()}
-            config.num_labels = num_labels
+    # (speech translation requires pre-configured seq2seq models)
+    if task_name != "covost2":
+        config.update(
+            {
+                "feat_proj_dropout": model_args.feat_proj_dropout,
+                "attention_dropout": model_args.attention_dropout,
+                "hidden_dropout": model_args.hidden_dropout,
+                "final_dropout": model_args.final_dropout,
+                "mask_time_prob": model_args.mask_time_prob,
+                "mask_time_length": model_args.mask_time_length,
+                "mask_feature_prob": model_args.mask_feature_prob,
+                "mask_feature_length": model_args.mask_feature_length,
+                "gradient_checkpointing": training_args.gradient_checkpointing,
+                "layerdrop": model_args.layerdrop,
+                "ctc_loss_reduction": model_args.ctc_loss_reduction,
+                "activation_dropout": model_args.activation_dropout,
+            }
+        )
+        if training_args.do_train:
+            if is_text_target:
+                config.pad_token_id = tokenizer.pad_token_id
+                config.vocab_size = len(tokenizer)
+            else:
+                label_to_id = {v: i for i, v in enumerate(label_list)}
+                config.label2id = label_to_id
+                config.id2label = {id: label for label, id in label_to_id.items()}
+                config.num_labels = num_labels
 
     # create model
     if target_column_name == "transcription":
@@ -688,6 +699,9 @@ def main():
             batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
         else:
             batch["labels"] = batch[target_column_name]
+
+        batch["lang"] = batch["lang_id"]
+
         return batch
 
     with training_args.main_process_first(desc="dataset map preprocessing"):
@@ -752,7 +766,8 @@ def main():
             tokenizer.save_pretrained(training_args.output_dir)
         config.save_pretrained(training_args.output_dir)
     # wait until configs are saved in the main process before loading the processor
-    torch.distributed.barrier()
+    if training_args.local_rank != -1:
+        torch.distributed.barrier()
 
     if is_text_target:
         processor = AutoProcessor.from_pretrained(training_args.output_dir)
@@ -816,7 +831,22 @@ def main():
     results = {}
     if training_args.do_predict:
         logger.info(f"*** Evaluating on the `{data_args.predict_split_name}` set ***")
-        metrics = trainer.evaluate(vectorized_datasets["predict"])
+        if data_args.per_lang_metrics:
+            # separate the `test` dataset into language-specific subsets and compute metrics for each of them
+            metrics = {}
+            average_metrics = defaultdict(list)
+            for lang_id in range(len(lang_list)):
+                lang_name = lang_list[lang_id]
+                lang_dataset = vectorized_datasets["predict"].filter(lambda example: example["lang"] == lang_id)
+                lang_metrics = trainer.evaluate(lang_dataset)
+                for metric_name, value in lang_metrics.items():
+                    average_metrics[metric_name].append(value)
+                    if metric_name not in ["eval_runtime", "eval_samples_per_second", "eval_steps_per_second"]:
+                        metrics[f"{metric_name}_{lang_name}"] = value
+            for metric_name, value in average_metrics.items():
+                metrics[metric_name] = np.mean(value)
+        else:
+            metrics = trainer.evaluate(vectorized_datasets["predict"])
         max_predict_samples = (
             data_args.max_predict_samples
             if data_args.max_predict_samples is not None
@@ -824,6 +854,9 @@ def main():
         )
         metrics["predict_samples"] = min(max_predict_samples, len(vectorized_datasets["predict"]))
 
+        # make sure that the `predict` metrics end up in the log history for the model card
+        trainer.log(OrderedDict(sorted(metrics.items())))
+
         trainer.log_metrics("predict", metrics)
         trainer.save_metrics("predict", metrics)