[xtreme-s] Update Minds14 results (#16241)
* update results * per-language metrics * Format the per-language metrics
This commit is contained in:
@@ -67,7 +67,7 @@ The corresponding training commands for each dataset are given in the sections b
|
|||||||
| Speech Recognition | VoxPopuli | - | - | - | - |
|
| Speech Recognition | VoxPopuli | - | - | - | - |
|
||||||
| Speech Recognition | FLEURS | - | - | - | - |
|
| Speech Recognition | FLEURS | - | - | - | - |
|
||||||
| Speech Translation | CoVoST-2 | - | - | - | - |
|
| Speech Translation | CoVoST-2 | - | - | - | - |
|
||||||
| Speech Classification | Minds-14 | 94.74 F1 / 94.70 Acc. | [here](https://huggingface.co/anton-l/xtreme_s_xlsr_300m_minds14/) | 04:46:40 | 2xA100 |
|
| Speech Classification | Minds-14 | 90.15 F1 / 90.33 Acc. | [here](https://huggingface.co/anton-l/xtreme_s_xlsr_300m_minds14/) | 2:54:21 | 2xA100 |
|
||||||
| Speech Classification | FLEURS | - | - | - | - |
|
| Speech Classification | FLEURS | - | - | - | - |
|
||||||
| Speech Retrieval | FLEURS | - | - | - | - |
|
| Speech Retrieval | FLEURS | - | - | - | - |
|
||||||
|
|
||||||
@@ -82,7 +82,6 @@ python -m torch.distributed.launch \
|
|||||||
--task="mls" \
|
--task="mls" \
|
||||||
--language="all" \
|
--language="all" \
|
||||||
--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
|
--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
|
||||||
--eval_split_name="test" \
|
|
||||||
--output_dir="xtreme_s_xlsr_300m_mls" \
|
--output_dir="xtreme_s_xlsr_300m_mls" \
|
||||||
--overwrite_output_dir \
|
--overwrite_output_dir \
|
||||||
--num_train_epochs=100 \
|
--num_train_epochs=100 \
|
||||||
@@ -158,4 +157,4 @@ python -m torch.distributed.launch \
|
|||||||
--push_to_hub
|
--push_to_hub
|
||||||
```
|
```
|
||||||
|
|
||||||
On 2 A100 GPUs, this script should run in ~5 hours and yield a cross-entropy loss of **0.2890** and F1 score of **94.74**
|
On 2 A100 GPUs, this script should run in ~3 hours and yield a cross-entropy loss of **0.4119** and F1 score of **90.15**
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from collections import OrderedDict, defaultdict
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Dict, List, Optional, Union
|
from typing import Dict, List, Optional, Union
|
||||||
|
|
||||||
@@ -273,6 +274,13 @@ class DataTrainingArguments:
|
|||||||
" input audio to a sequence of phoneme sequences."
|
" input audio to a sequence of phoneme sequences."
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
per_lang_metrics: bool = field(
|
||||||
|
default=True,
|
||||||
|
metadata={
|
||||||
|
"help": "If `True`, compute the test metrics separately for each language, and average the results. "
|
||||||
|
"If `False` compute the average test metrics in a single pass for all languages at once."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -470,10 +478,6 @@ def main():
|
|||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
|
raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
|
||||||
|
|
||||||
if not is_text_target:
|
|
||||||
label_list = raw_datasets["train"].features[target_column_name].names
|
|
||||||
num_labels = len(label_list)
|
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
raw_datasets["eval"] = load_dataset(
|
raw_datasets["eval"] = load_dataset(
|
||||||
data_args.dataset_name,
|
data_args.dataset_name,
|
||||||
@@ -498,6 +502,11 @@ def main():
|
|||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
raw_datasets["predict"] = raw_datasets["predict"].select(range(data_args.max_predict_samples))
|
raw_datasets["predict"] = raw_datasets["predict"].select(range(data_args.max_predict_samples))
|
||||||
|
|
||||||
|
if not is_text_target:
|
||||||
|
label_list = next(iter(raw_datasets.values())).features[target_column_name].names
|
||||||
|
lang_list = next(iter(raw_datasets.values())).features["lang_id"].names
|
||||||
|
num_labels = len(label_list)
|
||||||
|
|
||||||
# 2. We remove some special characters from the datasets
|
# 2. We remove some special characters from the datasets
|
||||||
# that make training complicated and do not help in transcribing the speech
|
# that make training complicated and do not help in transcribing the speech
|
||||||
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
|
||||||
@@ -593,31 +602,33 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# adapt config
|
# adapt config
|
||||||
config.update(
|
# (speech translation requires pre-configured seq2seq models)
|
||||||
{
|
if task_name != "covost2":
|
||||||
"feat_proj_dropout": model_args.feat_proj_dropout,
|
config.update(
|
||||||
"attention_dropout": model_args.attention_dropout,
|
{
|
||||||
"hidden_dropout": model_args.hidden_dropout,
|
"feat_proj_dropout": model_args.feat_proj_dropout,
|
||||||
"final_dropout": model_args.final_dropout,
|
"attention_dropout": model_args.attention_dropout,
|
||||||
"mask_time_prob": model_args.mask_time_prob,
|
"hidden_dropout": model_args.hidden_dropout,
|
||||||
"mask_time_length": model_args.mask_time_length,
|
"final_dropout": model_args.final_dropout,
|
||||||
"mask_feature_prob": model_args.mask_feature_prob,
|
"mask_time_prob": model_args.mask_time_prob,
|
||||||
"mask_feature_length": model_args.mask_feature_length,
|
"mask_time_length": model_args.mask_time_length,
|
||||||
"gradient_checkpointing": training_args.gradient_checkpointing,
|
"mask_feature_prob": model_args.mask_feature_prob,
|
||||||
"layerdrop": model_args.layerdrop,
|
"mask_feature_length": model_args.mask_feature_length,
|
||||||
"ctc_loss_reduction": model_args.ctc_loss_reduction,
|
"gradient_checkpointing": training_args.gradient_checkpointing,
|
||||||
"activation_dropout": model_args.activation_dropout,
|
"layerdrop": model_args.layerdrop,
|
||||||
}
|
"ctc_loss_reduction": model_args.ctc_loss_reduction,
|
||||||
)
|
"activation_dropout": model_args.activation_dropout,
|
||||||
if training_args.do_train:
|
}
|
||||||
if is_text_target:
|
)
|
||||||
config.pad_token_id = tokenizer.pad_token_id
|
if training_args.do_train:
|
||||||
config.vocab_size = len(tokenizer)
|
if is_text_target:
|
||||||
else:
|
config.pad_token_id = tokenizer.pad_token_id
|
||||||
label_to_id = {v: i for i, v in enumerate(label_list)}
|
config.vocab_size = len(tokenizer)
|
||||||
config.label2id = label_to_id
|
else:
|
||||||
config.id2label = {id: label for label, id in label_to_id.items()}
|
label_to_id = {v: i for i, v in enumerate(label_list)}
|
||||||
config.num_labels = num_labels
|
config.label2id = label_to_id
|
||||||
|
config.id2label = {id: label for label, id in label_to_id.items()}
|
||||||
|
config.num_labels = num_labels
|
||||||
|
|
||||||
# create model
|
# create model
|
||||||
if target_column_name == "transcription":
|
if target_column_name == "transcription":
|
||||||
@@ -688,6 +699,9 @@ def main():
|
|||||||
batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
|
batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids
|
||||||
else:
|
else:
|
||||||
batch["labels"] = batch[target_column_name]
|
batch["labels"] = batch[target_column_name]
|
||||||
|
|
||||||
|
batch["lang"] = batch["lang_id"]
|
||||||
|
|
||||||
return batch
|
return batch
|
||||||
|
|
||||||
with training_args.main_process_first(desc="dataset map preprocessing"):
|
with training_args.main_process_first(desc="dataset map preprocessing"):
|
||||||
@@ -752,7 +766,8 @@ def main():
|
|||||||
tokenizer.save_pretrained(training_args.output_dir)
|
tokenizer.save_pretrained(training_args.output_dir)
|
||||||
config.save_pretrained(training_args.output_dir)
|
config.save_pretrained(training_args.output_dir)
|
||||||
# wait until configs are saved in the main process before loading the processor
|
# wait until configs are saved in the main process before loading the processor
|
||||||
torch.distributed.barrier()
|
if training_args.local_rank != -1:
|
||||||
|
torch.distributed.barrier()
|
||||||
|
|
||||||
if is_text_target:
|
if is_text_target:
|
||||||
processor = AutoProcessor.from_pretrained(training_args.output_dir)
|
processor = AutoProcessor.from_pretrained(training_args.output_dir)
|
||||||
@@ -816,7 +831,22 @@ def main():
|
|||||||
results = {}
|
results = {}
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
logger.info(f"*** Evaluating on the `{data_args.predict_split_name}` set ***")
|
logger.info(f"*** Evaluating on the `{data_args.predict_split_name}` set ***")
|
||||||
metrics = trainer.evaluate(vectorized_datasets["predict"])
|
if data_args.per_lang_metrics:
|
||||||
|
# separate the `test` dataset into language-specific subsets and compute metrics for each of them
|
||||||
|
metrics = {}
|
||||||
|
average_metrics = defaultdict(list)
|
||||||
|
for lang_id in range(len(lang_list)):
|
||||||
|
lang_name = lang_list[lang_id]
|
||||||
|
lang_dataset = vectorized_datasets["predict"].filter(lambda example: example["lang"] == lang_id)
|
||||||
|
lang_metrics = trainer.evaluate(lang_dataset)
|
||||||
|
for metric_name, value in lang_metrics.items():
|
||||||
|
average_metrics[metric_name].append(value)
|
||||||
|
if metric_name not in ["eval_runtime", "eval_samples_per_second", "eval_steps_per_second"]:
|
||||||
|
metrics[f"{metric_name}_{lang_name}"] = value
|
||||||
|
for metric_name, value in average_metrics.items():
|
||||||
|
metrics[metric_name] = np.mean(value)
|
||||||
|
else:
|
||||||
|
metrics = trainer.evaluate(vectorized_datasets["predict"])
|
||||||
max_predict_samples = (
|
max_predict_samples = (
|
||||||
data_args.max_predict_samples
|
data_args.max_predict_samples
|
||||||
if data_args.max_predict_samples is not None
|
if data_args.max_predict_samples is not None
|
||||||
@@ -824,6 +854,9 @@ def main():
|
|||||||
)
|
)
|
||||||
metrics["predict_samples"] = min(max_predict_samples, len(vectorized_datasets["predict"]))
|
metrics["predict_samples"] = min(max_predict_samples, len(vectorized_datasets["predict"]))
|
||||||
|
|
||||||
|
# make sure that the `predict` metrics end up in the log history for the model card
|
||||||
|
trainer.log(OrderedDict(sorted(metrics.items())))
|
||||||
|
|
||||||
trainer.log_metrics("predict", metrics)
|
trainer.log_metrics("predict", metrics)
|
||||||
trainer.save_metrics("predict", metrics)
|
trainer.save_metrics("predict", metrics)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user