From 6015f91a5a28548a597f8d24341d089fe04994e8 Mon Sep 17 00:00:00 2001 From: Phuc Van Phan Date: Wed, 4 Oct 2023 21:31:38 +0700 Subject: [PATCH] refactor: change default block_size (#26229) * refactor: change default block_size * fix: return tf to origin * fix: change files to origin * rebase * rebase * rebase * rebase * rebase * rebase * rebase * rebase * refactor: add min block_size to files * reformat: add min block_size for run_clm tf --- examples/pytorch/language-modeling/run_clm.py | 9 ++++----- examples/pytorch/language-modeling/run_clm_no_trainer.py | 9 ++++----- .../jax-projects/model_parallel/run_clm_mp.py | 4 ++-- examples/tensorflow/language-modeling/run_clm.py | 6 +++--- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index a807f7b269..15c9261be4 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -497,13 +497,12 @@ def main(): if data_args.block_size is None: block_size = tokenizer.model_max_length - if block_size > 1024: + if block_size > config.max_position_embeddings: logger.warning( - "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" - " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" - " override this default with `--block_size xxx`." + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx." 
) - block_size = 1024 + block_size = min(1024, config.max_position_embeddings) else: if data_args.block_size > tokenizer.model_max_length: logger.warning( diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 7ed6fb56fb..d4e034167e 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -440,13 +440,12 @@ def main(): if args.block_size is None: block_size = tokenizer.model_max_length - if block_size > 1024: + if block_size > config.max_position_embeddings: logger.warning( - "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" - " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" - " override this default with `--block_size xxx`." + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx." ) - block_size = 1024 + block_size = min(1024, config.max_position_embeddings) else: if args.block_size > tokenizer.model_max_length: logger.warning( diff --git a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py index 12594d2202..a6da8729f0 100644 --- a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py +++ b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py @@ -362,9 +362,9 @@ def main(): if block_size > config.max_position_embeddings: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." + f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx." ) - block_size = 1024 + block_size = min(1024, config.max_position_embeddings) else: if data_args.block_size > tokenizer.model_max_length: logger.warning( diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py index 20d8bc12f4..4459f82581 100755 --- a/examples/tensorflow/language-modeling/run_clm.py +++ b/examples/tensorflow/language-modeling/run_clm.py @@ -423,12 +423,12 @@ def main(): if data_args.block_size is None: block_size = tokenizer.model_max_length - if block_size > 1024: + if block_size > config.max_position_embeddings: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " - "Picking 1024 instead. You can change that default value by passing --block_size xxx." + f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx." ) - block_size = 1024 + block_size = min(1024, config.max_position_embeddings) else: if data_args.block_size > tokenizer.model_max_length: logger.warning(