Conversion from slow to fast for BPE spm vocabs contained an error. (#10120)

* Conversion from slow to fast for BPE spm vocabs contained an error.
  - Currently only one test (tokenizers + slow) exercises the modified path: reformer, which does not modify any ids, so the bug has been silent until now.
  - The real issue is that the `vocab` variable was overwritten by SentencePieceExtractor, so slow-specific vocab oddities were completely ignored.
  - The bug was reported here: https://github.com/huggingface/transformers/issues/9518
  - Ran the complete tokenization test suite with slow tests enabled, without error (`RUN_SLOW=1 pytest -sv tests/test_tokenization_*`).
* Remove rebase error.
* Adding the fixture.
2021-02-13 14:24:53 +01:00
parent dd3a7f9641
commit c9837a0d27
3 changed files with 28 additions and 3 deletions
--- a/tests/fixtures/test_sentencepiece_bpe.model
+++ b/tests/fixtures/test_sentencepiece_bpe.model
--- a/tests/test_tokenization_camembert.py
+++ b/tests/test_tokenization_camembert.py
@@ -25,6 +25,7 @@ from .test_tokenization_common import TokenizerTesterMixin


 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
+SAMPLE_BPE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece_bpe.model")

 FRAMEWORK = "pt" if is_torch_available() else "tf"

@@ -44,6 +45,28 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)

+    def test_rust_and_python_bpe_tokenizers(self):
+        tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
+        tokenizer.save_pretrained(self.tmpdirname)
+        rust_tokenizer = CamembertTokenizerFast.from_pretrained(self.tmpdirname)
+
+        sequence = "I was born in 92000, and this is falsé."
+
+        ids = tokenizer.encode(sequence)
+        rust_ids = rust_tokenizer.encode(sequence)
+        self.assertListEqual(ids, rust_ids)
+
+        ids = tokenizer.encode(sequence, add_special_tokens=False)
+        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
+        self.assertListEqual(ids, rust_ids)
+
+        # <unk> tokens differ between the `rust` and `slow` tokenizers because spm's
+        # EncodeAsPieces gives back the raw token instead of `unk`, so the slow tokens are
+        # rebuilt from `ids` here rather than taken from `tokenizer.tokenize(sequence)`.
+        tokens = tokenizer.convert_ids_to_tokens(ids)
+        rust_tokens = rust_tokenizer.tokenize(sequence)
+        self.assertListEqual(tokens, rust_tokens)
+
    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return