From a1ad16a446e0b2cb0023af9fc0a61df9ecd12939 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Wed, 20 Jan 2021 04:17:39 -0500 Subject: [PATCH] Restrain tokenizer.model_max_length default (#9681) * Restrain tokenizer.model_max_length default * Fix indent --- examples/language-modeling/run_mlm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 309e02468c..223b8508fb 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -338,6 +338,12 @@ def main(): if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length + if max_seq_length > 1024: + logger.warn( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." + ) + max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warn(