Black preview (#17217)
* Black preview
* Fixup too!
* Fix check copies
* Use the same version as the CI
* Bump black
This commit is contained in:
@@ -101,9 +101,11 @@ class ModelArguments:
|
||||
mask_time_prob: float = field(
|
||||
default=0.05,
|
||||
metadata={
|
||||
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
|
||||
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
|
||||
"vectors will be masked along the time axis."
|
||||
"help": (
|
||||
"Probability of each feature vector along the time axis to be chosen as the start of the vector"
|
||||
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
|
||||
"vectors will be masked along the time axis."
|
||||
)
|
||||
},
|
||||
)
|
||||
mask_time_length: int = field(
|
||||
@@ -113,8 +115,11 @@ class ModelArguments:
|
||||
mask_feature_prob: float = field(
|
||||
default=0.0,
|
||||
metadata={
|
||||
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
|
||||
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
|
||||
"help": (
|
||||
"Probability of each feature vector along the feature axis to be chosen as the start of the vectorspan"
|
||||
" to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature"
|
||||
" bins will be masked along the time axis."
|
||||
)
|
||||
},
|
||||
)
|
||||
mask_feature_length: int = field(
|
||||
@@ -146,8 +151,10 @@ class DataTrainingArguments:
|
||||
train_split_name: str = field(
|
||||
default="train+validation",
|
||||
metadata={
|
||||
"help": "The name of the training data set split to use (via the datasets library). Defaults to "
|
||||
"'train+validation'"
|
||||
"help": (
|
||||
"The name of the training data set split to use (via the datasets library). Defaults to "
|
||||
"'train+validation'"
|
||||
)
|
||||
},
|
||||
)
|
||||
eval_split_name: str = field(
|
||||
@@ -174,15 +181,19 @@ class DataTrainingArguments:
|
||||
max_train_samples: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
|
||||
"value if set."
|
||||
"help": (
|
||||
"For debugging purposes or quicker training, truncate the number of training examples to this "
|
||||
"value if set."
|
||||
)
|
||||
},
|
||||
)
|
||||
max_eval_samples: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
|
||||
"value if set."
|
||||
"help": (
|
||||
"For debugging purposes or quicker training, truncate the number of validation examples to this "
|
||||
"value if set."
|
||||
)
|
||||
},
|
||||
)
|
||||
chars_to_ignore: Optional[List[str]] = list_field(
|
||||
@@ -196,7 +207,10 @@ class DataTrainingArguments:
|
||||
max_duration_in_seconds: float = field(
|
||||
default=20.0,
|
||||
metadata={
|
||||
"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
|
||||
"help": (
|
||||
"Filter audio files that are longer than `max_duration_in_seconds` seconds to"
|
||||
" 'max_duration_in_seconds`"
|
||||
)
|
||||
},
|
||||
)
|
||||
min_duration_in_seconds: float = field(
|
||||
@@ -205,17 +219,21 @@ class DataTrainingArguments:
|
||||
preprocessing_only: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Whether to only do data preprocessing and skip training. "
|
||||
"This is especially useful when data preprocessing errors out in distributed training due to timeout. "
|
||||
"In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
|
||||
"so that the cached datasets can consequently be loaded in distributed training"
|
||||
"help": (
|
||||
"Whether to only do data preprocessing and skip training. This is especially useful when data"
|
||||
" preprocessing errors out in distributed training due to timeout. In this case, one should run the"
|
||||
" preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
|
||||
" can consequently be loaded in distributed training"
|
||||
)
|
||||
},
|
||||
)
|
||||
use_auth_token: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "If :obj:`True`, will use the token generated when running"
|
||||
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
|
||||
"help": (
|
||||
"If :obj:`True`, will use the token generated when running"
|
||||
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
|
||||
)
|
||||
},
|
||||
)
|
||||
unk_token: str = field(
|
||||
@@ -233,10 +251,12 @@ class DataTrainingArguments:
|
||||
phoneme_language: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The target language that should be used be"
|
||||
" passed to the tokenizer for tokenization. Note that"
|
||||
" this is only relevant if the model classifies the"
|
||||
" input audio to a sequence of phoneme sequences."
|
||||
"help": (
|
||||
"The target language that should be used be"
|
||||
" passed to the tokenizer for tokenization. Note that"
|
||||
" this is only relevant if the model classifies the"
|
||||
" input audio to a sequence of phoneme sequences."
|
||||
)
|
||||
},
|
||||
)
|
||||
|
||||
@@ -405,9 +425,9 @@ def main():
|
||||
|
||||
if data_args.audio_column_name not in raw_datasets["train"].column_names:
|
||||
raise ValueError(
|
||||
f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. "
|
||||
"Make sure to set `--audio_column_name` to the correct audio column - one of "
|
||||
f"{', '.join(raw_datasets['train'].column_names)}."
|
||||
f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'."
|
||||
" Make sure to set `--audio_column_name` to the correct audio column - one of"
|
||||
f" {', '.join(raw_datasets['train'].column_names)}."
|
||||
)
|
||||
|
||||
if data_args.text_column_name not in raw_datasets["train"].column_names:
|
||||
@@ -720,7 +740,10 @@ def main():
|
||||
"finetuned_from": model_args.model_name_or_path,
|
||||
"tasks": "speech-recognition",
|
||||
"tags": ["automatic-speech-recognition", data_args.dataset_name],
|
||||
"dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
|
||||
"dataset_args": (
|
||||
f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:"
|
||||
f" {data_args.eval_split_name}"
|
||||
),
|
||||
"dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}",
|
||||
}
|
||||
if "common_voice" in data_args.dataset_name:
|
||||
|
||||
@@ -87,8 +87,10 @@ class ModelArguments:
|
||||
use_auth_token: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
|
||||
"with private models)."
|
||||
"help": (
|
||||
"Will use the token generated when running `transformers-cli login` (necessary to use this script "
|
||||
"with private models)."
|
||||
)
|
||||
},
|
||||
)
|
||||
freeze_feature_encoder: bool = field(
|
||||
@@ -122,15 +124,19 @@ class DataTrainingArguments:
|
||||
max_train_samples: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
|
||||
"value if set."
|
||||
"help": (
|
||||
"For debugging purposes or quicker training, truncate the number of training examples to this "
|
||||
"value if set."
|
||||
)
|
||||
},
|
||||
)
|
||||
max_eval_samples: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
|
||||
"value if set."
|
||||
"help": (
|
||||
"For debugging purposes or quicker training, truncate the number of evaluation examples to this "
|
||||
"value if set."
|
||||
)
|
||||
},
|
||||
)
|
||||
audio_column_name: str = field(
|
||||
@@ -144,7 +150,10 @@ class DataTrainingArguments:
|
||||
max_duration_in_seconds: float = field(
|
||||
default=20.0,
|
||||
metadata={
|
||||
"help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
|
||||
"help": (
|
||||
"Truncate audio files that are longer than `max_duration_in_seconds` seconds to"
|
||||
" 'max_duration_in_seconds`"
|
||||
)
|
||||
},
|
||||
)
|
||||
min_duration_in_seconds: float = field(
|
||||
@@ -153,10 +162,12 @@ class DataTrainingArguments:
|
||||
preprocessing_only: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Whether to only do data preprocessing and skip training. "
|
||||
"This is especially useful when data preprocessing errors out in distributed training due to timeout. "
|
||||
"In this case, one should run the preprocessing in a non-distributed setup with `preprocessing_only=True` "
|
||||
"so that the cached datasets can consequently be loaded in distributed training"
|
||||
"help": (
|
||||
"Whether to only do data preprocessing and skip training. This is especially useful when data"
|
||||
" preprocessing errors out in distributed training due to timeout. In this case, one should run the"
|
||||
" preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
|
||||
" can consequently be loaded in distributed training"
|
||||
)
|
||||
},
|
||||
)
|
||||
train_split_name: str = field(
|
||||
|
||||
Reference in New Issue
Block a user