Revert error back into warning for byte fallback conversion. (#22607)
This commit is contained in:
@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
|
|||||||
allow to make our dependency on SentencePiece optional.
|
allow to make our dependency on SentencePiece optional.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import warnings
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
|
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
|
||||||
@@ -450,7 +451,7 @@ class SpmConverter(Converter):
|
|||||||
|
|
||||||
if self.proto.trainer_spec.byte_fallback:
|
if self.proto.trainer_spec.byte_fallback:
|
||||||
if not getattr(self, "handle_byte_fallback", None):
|
if not getattr(self, "handle_byte_fallback", None):
|
||||||
raise RuntimeError(
|
warnings.warn(
|
||||||
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
|
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
|
||||||
" which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
|
" which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
|
||||||
" tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
|
" tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
|
||||||
|
|||||||
@@ -24,10 +24,12 @@ class ConvertSlowTokenizerTest(unittest.TestCase):
|
|||||||
|
|
||||||
original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
|
original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
|
||||||
|
|
||||||
with self.assertRaises(RuntimeError) as cm:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
_ = SpmConverter(original_tokenizer_with_bytefallback)
|
_ = SpmConverter(original_tokenizer_with_bytefallback)
|
||||||
|
self.assertEqual(len(w), 1)
|
||||||
|
|
||||||
self.assertIn(
|
self.assertIn(
|
||||||
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
|
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
|
||||||
" which is not implemented in the fast tokenizers.",
|
" which is not implemented in the fast tokenizers.",
|
||||||
str(cm.exception),
|
str(w[0].message),
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user