From acc851e1ff92835d2a3ee9774d9d0abfda6e3f36 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 8 Apr 2021 09:46:28 -0700 Subject: [PATCH] [run_clm] clarify why we get the tokenizer warning on long input (#11145) * clarify why we get the warning here * Update examples/language-modeling/run_clm.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * wording * style Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- examples/language-modeling/run_clm.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index a49c815e2c..31221dffd5 100755 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -43,6 +43,7 @@ from transformers import ( default_data_collator, set_seed, ) +from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.utils import check_min_version @@ -317,7 +318,15 @@ def main(): text_column_name = "text" if "text" in column_names else column_names[0] def tokenize_function(examples): - return tokenizer(examples[text_column_name]) + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model." + ) + return output tokenized_datasets = datasets.map( tokenize_function,