Add a warning in SpmConverter for sentencepiece models that use the byte fallback feature (#16629)

* update proto sentencepiece model

* Revert "update proto sentencepiece model"

This reverts commit b07f671747fec35773d0b3d4788b8b15aefa0229.

* add check

* add test

* Revert "Revert "update proto sentencepiece model""

This reverts commit 46108257b8927b73627ec8f4f3eed53a95fc700d.

* test for log level

* test for log level 2

* warning at the warning level

* clean

* format

* add explanation in docstring
This commit is contained in:
SaulLu
2022-04-11 11:06:10 +02:00
committed by GitHub
parent 7c5d79912a
commit 1025a9b742
4 changed files with 486 additions and 158 deletions

Binary file not shown.

View File

@@ -0,0 +1,36 @@
import unittest
import warnings
from dataclasses import dataclass
from transformers.convert_slow_tokenizer import SpmConverter
from transformers.testing_utils import get_tests_dir
@dataclass
class FakeOriginalTokenizer:
    """Minimal stand-in for a slow tokenizer, used to build an `SpmConverter` in tests.

    It carries only `vocab_file`, the single attribute these tests supply to
    the converter.
    """

    # Path to the sentencepiece `.model` file handed to the converter.
    vocab_file: str
class ConvertSlowTokenizerTest(unittest.TestCase):
    """Tests for the slow-to-fast tokenizer conversion helpers."""

    def test_spm_converter_bytefallback_warning(self):
        """`SpmConverter` must warn once, and only, for a model that uses byte fallback."""
        spm_model_file_without_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece.model"
        spm_model_file_with_bytefallback = f"{get_tests_dir()}/fixtures/test_sentencepiece_with_bytefallback.model"

        expected_message = (
            "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
            " which is not implemented in the fast tokenizers."
        )

        def collect_bytefallback_warnings(vocab_file):
            """Build an `SpmConverter` for `vocab_file` and return the byte-fallback warnings it raised."""
            original_tokenizer = FakeOriginalTokenizer(vocab_file=vocab_file)
            with warnings.catch_warnings(record=True) as recorded:
                # Without this, ambient filters or the per-module "already warned"
                # registry can hide the warning, making the result depend on what
                # ran earlier in the same process.
                warnings.simplefilter("always")
                _ = SpmConverter(original_tokenizer)
            # Count only the warning under test: unrelated warnings emitted during
            # conversion (e.g. deprecations from dependencies) must not affect it.
            return [item for item in recorded if expected_message in str(item.message)]

        self.assertEqual(len(collect_bytefallback_warnings(spm_model_file_without_bytefallback)), 0)
        self.assertEqual(len(collect_bytefallback_warnings(spm_model_file_with_bytefallback)), 1)