From 0aa1153ffbfedc5c647d91a669b63360d1ff8d05 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 6 Apr 2023 14:00:29 +0200
Subject: [PATCH] Revert error back into warning for byte fallback conversion. (#22607)

---
 src/transformers/convert_slow_tokenizer.py | 3 ++-
 tests/utils/test_convert_slow_tokenizer.py | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index 50e11dbd86..88706da0f1 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
 allow to make our dependency on SentencePiece optional.
 """
 
+import warnings
 from typing import Dict, List, Tuple
 
 from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@@ -450,7 +451,7 @@ class SpmConverter(Converter):
 
         if self.proto.trainer_spec.byte_fallback:
             if not getattr(self, "handle_byte_fallback", None):
-                raise RuntimeError(
+                warnings.warn(
                     "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
                     " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
                     " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
diff --git a/tests/utils/test_convert_slow_tokenizer.py b/tests/utils/test_convert_slow_tokenizer.py
index c76fe6f1f9..edeb06c390 100644
--- a/tests/utils/test_convert_slow_tokenizer.py
+++ b/tests/utils/test_convert_slow_tokenizer.py
@@ -24,10 +24,12 @@ class ConvertSlowTokenizerTest(unittest.TestCase):
 
         original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
 
-        with self.assertRaises(RuntimeError) as cm:
+        with warnings.catch_warnings(record=True) as w:
             _ = SpmConverter(original_tokenizer_with_bytefallback)
+        self.assertEqual(len(w), 1)
+
         self.assertIn(
             "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
             " which is not implemented in the fast tokenizers.",
-            str(cm.exception),
+            str(w[0].message),
         )