Adding support for tokens being suffixes or part of each other. (#13918)

* Adding support for tokens being suffixes or part of each other.

* Better test name.
This commit is contained in:
Nicolas Patry
2021-10-08 10:10:38 +02:00
committed by GitHub
parent 026866df92
commit d70919e6d5
2 changed files with 62 additions and 15 deletions

View File

@@ -3574,3 +3574,24 @@ class TrieTest(unittest.TestCase):
trie.add("TOKEN]")
trie.add("[SPECIAL_TOKEN]")
self.assertEqual(trie.split("This is something [SPECIAL_TOKEN]"), ["This is something ", "[SPECIAL_TOKEN]"])
def test_trie_subtokens(self):
    """Short tokens contained in a longer token must not split that token.

    "A" and "P" both occur inside "[SPECIAL_TOKEN]"; the split is still
    expected to return the leading text followed by the whole special token.
    """
    token_trie = Trie()
    # Register the short tokens first, then the longer one containing them.
    for token in ("A", "P", "[SPECIAL_TOKEN]"):
        token_trie.add(token)

    pieces = token_trie.split("This is something [SPECIAL_TOKEN]")
    self.assertEqual(pieces, ["This is something ", "[SPECIAL_TOKEN]"])
def test_trie_suffix_tokens(self):
    """A token that is a suffix of another token must not disturb the split.

    "B" is a suffix of "AB"; splitting "ABC" is expected to yield "AB"
    followed by "C".
    """
    token_trie = Trie()
    for token in ("AB", "B", "C"):
        token_trie.add(token)

    pieces = token_trie.split("ABC")
    self.assertEqual(pieces, ["AB", "C"])
def test_cut_text_hardening(self):
    """``cut_text`` must return correct string parts even for bad offsets.

    The offsets passed here contain a repeated entry (0, 0) and an
    out-of-order entry (2, 1); the expected output is still "AB", "C".
    """
    text = "ABC"
    bad_offsets = [0, 0, 2, 1, 2, 3]

    pieces = Trie().cut_text(text, bad_offsets)
    self.assertEqual(pieces, ["AB", "C"])