From 3b9a1dc13209d0cab347bf2363d18963cc3f9194 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 6 Feb 2023 08:36:12 -0800 Subject: [PATCH] [examples] improve block_size warning message (#21463) --- examples/pytorch/language-modeling/run_clm.py | 5 +++-- examples/pytorch/language-modeling/run_clm_no_trainer.py | 5 +++-- examples/pytorch/language-modeling/run_mlm.py | 5 +++-- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 5 +++-- examples/pytorch/multiple-choice/run_swag.py | 5 +++-- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 12adcdae1b..2a20e2ef45 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -459,8 +459,9 @@ def main(): block_size = tokenizer.model_max_length if block_size > 1024: logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." + "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." ) block_size = 1024 else: diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 8c8142387a..1592a72897 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -407,8 +407,9 @@ def main(): block_size = tokenizer.model_max_length if block_size > 1024: logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." + "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." ) block_size = 1024 else: diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 16dc11abf2..9639af9513 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -414,8 +414,9 @@ def main(): max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." + "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." ) max_seq_length = 1024 else: diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 613af86525..12d2bbcb54 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -399,8 +399,9 @@ def main(): max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." + "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." ) max_seq_length = 1024 else: diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 2e8572fbcc..a69171766a 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -336,8 +336,9 @@ def main(): max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( - f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." + "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." ) max_seq_length = 1024 else: