Black preview (#17217)
* Black preview
* Fixup too!
* Fix check copies
* Use the same version as the CI
* Bump black
This commit is contained in:
@@ -103,15 +103,20 @@ if __name__ == "__main__":
|
||||
choices=["l0", "magnitude", "topK", "sigmoied_threshold"],
|
||||
type=str,
|
||||
required=True,
|
||||
help="Pruning Method (l0 = L0 regularization, magnitude = Magnitude pruning, topK = Movement pruning, sigmoied_threshold = Soft movement pruning)",
|
||||
help=(
|
||||
"Pruning Method (l0 = L0 regularization, magnitude = Magnitude pruning, topK = Movement pruning,"
|
||||
" sigmoied_threshold = Soft movement pruning)"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--threshold",
|
||||
type=float,
|
||||
required=False,
|
||||
help="For `magnitude` and `topK`, it is the level of remaining weights (in %) in the fine-pruned model."
|
||||
"For `sigmoied_threshold`, it is the threshold \tau against which the (sigmoied) scores are compared."
|
||||
"Not needed for `l0`",
|
||||
help=(
|
||||
"For `magnitude` and `topK`, it is the level of remaining weights (in %) in the fine-pruned model."
|
||||
"For `sigmoied_threshold`, it is the threshold \tau against which the (sigmoied) scores are compared."
|
||||
"Not needed for `l0`"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
|
||||
@@ -70,15 +70,20 @@ if __name__ == "__main__":
|
||||
choices=["l0", "topK", "sigmoied_threshold"],
|
||||
type=str,
|
||||
required=True,
|
||||
help="Pruning Method (l0 = L0 regularization, topK = Movement pruning, sigmoied_threshold = Soft movement pruning)",
|
||||
help=(
|
||||
"Pruning Method (l0 = L0 regularization, topK = Movement pruning, sigmoied_threshold = Soft movement"
|
||||
" pruning)"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--threshold",
|
||||
type=float,
|
||||
required=False,
|
||||
help="For `topK`, it is the level of remaining weights (in %) in the fine-pruned model."
|
||||
"For `sigmoied_threshold`, it is the threshold \tau against which the (sigmoied) scores are compared."
|
||||
"Not needed for `l0`",
|
||||
help=(
|
||||
"For `topK`, it is the level of remaining weights (in %) in the fine-pruned model."
|
||||
"For `sigmoied_threshold`, it is the threshold \tau against which the (sigmoied) scores are compared."
|
||||
"Not needed for `l0`"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--serialization_dir",
|
||||
|
||||
@@ -80,8 +80,8 @@ class BertSelfAttention(nn.Module):
|
||||
super().__init__()
|
||||
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
|
||||
raise ValueError(
|
||||
"The hidden size (%d) is not a multiple of the number of attention "
|
||||
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
|
||||
"The hidden size (%d) is not a multiple of the number of attention heads (%d)"
|
||||
% (config.hidden_size, config.num_attention_heads)
|
||||
)
|
||||
self.output_attentions = config.output_attentions
|
||||
|
||||
|
||||
@@ -622,8 +622,10 @@ def main():
|
||||
"--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded.",
|
||||
help=(
|
||||
"The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
@@ -669,22 +671,29 @@ def main():
|
||||
"--initial_warmup",
|
||||
default=1,
|
||||
type=int,
|
||||
help="Run `initial_warmup` * `warmup_steps` steps of threshold warmup during which threshold stays"
|
||||
"at its `initial_threshold` value (sparsity schedule).",
|
||||
help=(
|
||||
"Run `initial_warmup` * `warmup_steps` steps of threshold warmup during which threshold stays"
|
||||
"at its `initial_threshold` value (sparsity schedule)."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--final_warmup",
|
||||
default=2,
|
||||
type=int,
|
||||
help="Run `final_warmup` * `warmup_steps` steps of threshold cool-down during which threshold stays"
|
||||
"at its final_threshold value (sparsity schedule).",
|
||||
help=(
|
||||
"Run `final_warmup` * `warmup_steps` steps of threshold cool-down during which threshold stays"
|
||||
"at its final_threshold value (sparsity schedule)."
|
||||
),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--pruning_method",
|
||||
default="topK",
|
||||
type=str,
|
||||
help="Pruning Method (l0 = L0 regularization, magnitude = Magnitude pruning, topK = Movement pruning, sigmoied_threshold = Soft movement pruning).",
|
||||
help=(
|
||||
"Pruning Method (l0 = L0 regularization, magnitude = Magnitude pruning, topK = Movement pruning,"
|
||||
" sigmoied_threshold = Soft movement pruning)."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mask_init",
|
||||
@@ -717,7 +726,10 @@ def main():
|
||||
"--teacher_type",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.",
|
||||
help=(
|
||||
"Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for"
|
||||
" distillation."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--teacher_name_or_path",
|
||||
@@ -787,8 +799,10 @@ def main():
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
help=(
|
||||
"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html"
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
|
||||
@@ -805,7 +819,8 @@ def main():
|
||||
and not args.overwrite_output_dir
|
||||
):
|
||||
raise ValueError(
|
||||
f"Output directory ({args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
|
||||
f"Output directory ({args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to"
|
||||
" overcome."
|
||||
)
|
||||
|
||||
# Setup CUDA, GPU & distributed training
|
||||
|
||||
@@ -737,8 +737,10 @@ def main():
|
||||
"--max_seq_length",
|
||||
default=384,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded.",
|
||||
help=(
|
||||
"The maximum total input sequence length after WordPiece tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--doc_stride",
|
||||
@@ -750,8 +752,10 @@ def main():
|
||||
"--max_query_length",
|
||||
default=64,
|
||||
type=int,
|
||||
help="The maximum number of tokens for the question. Questions longer than this will "
|
||||
"be truncated to this length.",
|
||||
help=(
|
||||
"The maximum number of tokens for the question. Questions longer than this will "
|
||||
"be truncated to this length."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
@@ -785,22 +789,29 @@ def main():
|
||||
"--initial_warmup",
|
||||
default=1,
|
||||
type=int,
|
||||
help="Run `initial_warmup` * `warmup_steps` steps of threshold warmup during which threshold stays"
|
||||
"at its `initial_threshold` value (sparsity schedule).",
|
||||
help=(
|
||||
"Run `initial_warmup` * `warmup_steps` steps of threshold warmup during which threshold stays"
|
||||
"at its `initial_threshold` value (sparsity schedule)."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--final_warmup",
|
||||
default=2,
|
||||
type=int,
|
||||
help="Run `final_warmup` * `warmup_steps` steps of threshold cool-down during which threshold stays"
|
||||
"at its final_threshold value (sparsity schedule).",
|
||||
help=(
|
||||
"Run `final_warmup` * `warmup_steps` steps of threshold cool-down during which threshold stays"
|
||||
"at its final_threshold value (sparsity schedule)."
|
||||
),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--pruning_method",
|
||||
default="topK",
|
||||
type=str,
|
||||
help="Pruning Method (l0 = L0 regularization, magnitude = Magnitude pruning, topK = Movement pruning, sigmoied_threshold = Soft movement pruning).",
|
||||
help=(
|
||||
"Pruning Method (l0 = L0 regularization, magnitude = Magnitude pruning, topK = Movement pruning,"
|
||||
" sigmoied_threshold = Soft movement pruning)."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mask_init",
|
||||
@@ -833,7 +844,10 @@ def main():
|
||||
"--teacher_type",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.",
|
||||
help=(
|
||||
"Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for"
|
||||
" distillation."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--teacher_name_or_path",
|
||||
@@ -883,20 +897,27 @@ def main():
|
||||
"--max_answer_length",
|
||||
default=30,
|
||||
type=int,
|
||||
help="The maximum length of an answer that can be generated. This is needed because the start "
|
||||
"and end predictions are not conditioned on one another.",
|
||||
help=(
|
||||
"The maximum length of an answer that can be generated. This is needed because the start "
|
||||
"and end predictions are not conditioned on one another."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose_logging",
|
||||
action="store_true",
|
||||
help="If true, all of the warnings related to data processing will be printed. "
|
||||
"A number of warnings are expected for a normal SQuAD evaluation.",
|
||||
help=(
|
||||
"If true, all of the warnings related to data processing will be printed. "
|
||||
"A number of warnings are expected for a normal SQuAD evaluation."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lang_id",
|
||||
default=0,
|
||||
type=int,
|
||||
help="language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)",
|
||||
help=(
|
||||
"language id of input for language-specific xlm models (see"
|
||||
" tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)"
|
||||
),
|
||||
)
|
||||
|
||||
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
|
||||
@@ -925,8 +946,10 @@ def main():
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html",
|
||||
help=(
|
||||
"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
|
||||
"See details at https://nvidia.github.io/apex/amp.html"
|
||||
),
|
||||
)
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
|
||||
Reference in New Issue
Block a user