Black preview (#17217)

* Black preview

* Fixup too!

* Fix check copies

* Use the same version as the CI

* Bump black
This commit is contained in:
Sylvain Gugger
2022-05-12 16:25:55 -04:00
committed by GitHub
parent 9bd67ac7bb
commit afe5d42d8d
578 changed files with 8274 additions and 3296 deletions

View File

@@ -24,7 +24,7 @@ def log_results(result: Dataset, args: Dict[str, str]):
cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
# print & log results
result_str = f"WER: {wer_result}\n" f"CER: {cer_result}"
result_str = f"WER: {wer_result}\nCER: {cer_result}"
print(result_str)
with open(f"{dataset_id}_eval_results.txt", "w") as f:

View File

@@ -103,9 +103,11 @@ class ModelArguments:
mask_time_prob: float = field(
default=0.05,
metadata={
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
"vectors will be masked along the time axis."
"help": (
"Probability of each feature vector along the time axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
"vectors will be masked along the time axis."
)
},
)
mask_time_length: int = field(
@@ -115,8 +117,11 @@ class ModelArguments:
mask_feature_prob: float = field(
default=0.0,
metadata={
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
"help": (
"Probability of each feature vector along the feature axis to be chosen as the start of the vectorspan"
" to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature"
" bins will be masked along the time axis."
)
},
)
mask_feature_length: int = field(
@@ -175,15 +180,19 @@ class DataTrainingArguments:
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
"help": (
"For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
)
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
"help": (
"For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
)
},
)
chars_to_ignore: Optional[List[str]] = list_field(
@@ -197,7 +206,10 @@ class DataTrainingArguments:
max_duration_in_seconds: float = field(
default=20.0,
metadata={
"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
"help": (
"Filter audio files that are longer than `max_duration_in_seconds` seconds to"
" 'max_duration_in_seconds`"
)
},
)
min_duration_in_seconds: float = field(
@@ -206,17 +218,21 @@ class DataTrainingArguments:
preprocessing_only: bool = field(
default=False,
metadata={
"help": "Whether to only do data preprocessing and skip training. "
"This is especially useful when data preprocessing errors out in distributed training due to timeout. "
"In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
"so that the cached datasets can consequently be loaded in distributed training"
"help": (
"Whether to only do data preprocessing and skip training. This is especially useful when data"
" preprocessing errors out in distributed training due to timeout. In this case, one should run the"
" preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
" can consequently be loaded in distributed training"
)
},
)
use_auth_token: bool = field(
default=False,
metadata={
"help": "If :obj:`True`, will use the token generated when running"
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
"help": (
"If :obj:`True`, will use the token generated when running"
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
)
},
)
unk_token: str = field(
@@ -234,10 +250,12 @@ class DataTrainingArguments:
phoneme_language: Optional[str] = field(
default=None,
metadata={
"help": "The target language that should be used be"
" passed to the tokenizer for tokenization. Note that"
" this is only relevant if the model classifies the"
" input audio to a sequence of phoneme sequences."
"help": (
"The target language that should be used be"
" passed to the tokenizer for tokenization. Note that"
" this is only relevant if the model classifies the"
" input audio to a sequence of phoneme sequences."
)
},
)
@@ -406,9 +424,9 @@ def main():
if data_args.audio_column_name not in raw_datasets["train"].column_names:
raise ValueError(
f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
"Make sure to set `--audio_column_name` to the correct audio column - one of "
f"{', '.join(raw_datasets['train'].column_names)}."
f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'."
" Make sure to set `--audio_column_name` to the correct audio column - one of"
f" {', '.join(raw_datasets['train'].column_names)}."
)
if data_args.text_column_name not in raw_datasets["train"].column_names:
@@ -743,7 +761,10 @@ def main():
"finetuned_from": model_args.model_name_or_path,
"tasks": "speech-recognition",
"tags": ["automatic-speech-recognition", data_args.dataset_name],
"dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
"dataset_args": (
f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:"
f" {data_args.eval_split_name}"
),
"dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
}
if "common_voice" in data_args.dataset_name:

View File

@@ -102,9 +102,11 @@ class ModelArguments:
mask_time_prob: float = field(
default=0.05,
metadata={
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
"vectors will be masked along the time axis."
"help": (
"Probability of each feature vector along the time axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
"vectors will be masked along the time axis."
)
},
)
mask_time_length: int = field(
@@ -114,8 +116,11 @@ class ModelArguments:
mask_feature_prob: float = field(
default=0.0,
metadata={
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
"help": (
"Probability of each feature vector along the feature axis to be chosen as the start of the vectorspan"
" to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature"
" bins will be masked along the time axis."
)
},
)
mask_feature_length: int = field(
@@ -147,8 +152,10 @@ class DataTrainingArguments:
train_split_name: str = field(
default="train+validation",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to "
"'train+validation'"
"help": (
"The name of the training data set split to use (via the datasets library). Defaults to "
"'train+validation'"
)
},
)
eval_split_name: str = field(
@@ -175,22 +182,28 @@ class DataTrainingArguments:
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
"help": (
"For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
)
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
"help": (
"For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
)
},
)
shuffle_buffer_size: Optional[int] = field(
default=500,
metadata={
"help": "The number of streamed examples to download before shuffling them. The large the buffer, "
"the closer it is to real offline shuffling."
"help": (
"The number of streamed examples to download before shuffling them. The large the buffer, "
"the closer it is to real offline shuffling."
)
},
)
chars_to_ignore: Optional[List[str]] = list_field(
@@ -208,26 +221,32 @@ class DataTrainingArguments:
preprocessing_only: bool = field(
default=False,
metadata={
"help": "Whether to only do data preprocessing and skip training. "
"This is especially useful when data preprocessing errors out in distributed training due to timeout. "
"In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
"so that the cached datasets can consequently be loaded in distributed training"
"help": (
"Whether to only do data preprocessing and skip training. This is especially useful when data"
" preprocessing errors out in distributed training due to timeout. In this case, one should run the"
" preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
" can consequently be loaded in distributed training"
)
},
)
use_auth_token: bool = field(
default=False,
metadata={
"help": "If :obj:`True`, will use the token generated when running"
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
"help": (
"If :obj:`True`, will use the token generated when running"
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
)
},
)
phoneme_language: Optional[str] = field(
default=None,
metadata={
"help": "The target language that should be used be"
" passed to the tokenizer for tokenization. Note that"
" this is only relevant if the model classifies the"
" input audio to a sequence of phoneme sequences."
"help": (
"The target language that should be used be"
" passed to the tokenizer for tokenization. Note that"
" this is only relevant if the model classifies the"
" input audio to a sequence of phoneme sequences."
)
},
)
@@ -393,9 +412,9 @@ def main():
if data_args.audio_column_name not in raw_column_names["train"]:
raise ValueError(
f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
"Make sure to set `--audio_column_name` to the correct audio column - one of "
f"{', '.join(raw_column_names['train'])}."
f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'."
" Make sure to set `--audio_column_name` to the correct audio column - one of"
f" {', '.join(raw_column_names['train'])}."
)
if data_args.text_column_name not in raw_column_names["train"]:
@@ -641,7 +660,10 @@ def main():
"finetuned_from": model_args.model_name_or_path,
"tasks": "speech-recognition",
"tags": ["automatic-speech-recognition", data_args.dataset_name],
"dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
"dataset_args": (
f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:"
f" {data_args.eval_split_name}"
),
"dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
}
if "common_voice" in data_args.dataset_name: