Fixing a pathological case for slow tokenizers (#14981)

* Fixing a pathological case for slow tokenizers
* Update src/transformers/tokenization_utils.py
2021-12-30 09:10:34 +01:00
parent d1ba56d8d8
commit d7d60df0ec
2 changed files with 14 additions and 2 deletions
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -3687,6 +3687,13 @@ class TrieTest(unittest.TestCase):
        trie.add("C")
        self.assertEqual(trie.split("ABC"), ["AB", "C"])

+    def test_trie_skip(self):  # split() with overlapping entries: "B" and "CD" both overlap "ABC" in "ABCD"
+        trie = Trie()
+        trie.add("ABC")
+        trie.add("B")  # lies entirely inside the "ABC" span of the input
+        trie.add("CD")  # starts on the last character of the "ABC" span of the input
+        self.assertEqual(trie.split("ABCD"), ["ABC", "D"])  # "ABC" is kept whole; "D" is the leftover
+
    def test_cut_text_hardening(self):
        # Even if the offsets are wrong, we necessarily output correct string
        # parts.