From 07f0bb691d733a93e5eefd104145649810c7ebb0 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Fri, 9 Apr 2021 11:39:12 -0700
Subject: [PATCH] [examples run_clm] fix _LazyModule hasher error (#11168)

* fix _LazyModule hasher error

* reword
---
 examples/language-modeling/run_clm.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py
index 31221dffd5..505f8f68c4 100755
--- a/examples/language-modeling/run_clm.py
+++ b/examples/language-modeling/run_clm.py
@@ -317,8 +317,10 @@ def main():
         column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]
 
+    # tokenize_function will be pickled, so load the logger before defining it to avoid a _LazyModule error in Hasher
+    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
+
     def tokenize_function(examples):
-        tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
         with CaptureLogger(tok_logger) as cl:
             output = tokenizer(examples[text_column_name])
         # clm input could be much much longer than block_size