Adding support for tokens being suffixes or part of each other. (#13918)
* Adding support for tokens being suffixes or part of each other. * Better test name.
This commit is contained in:
@@ -3574,3 +3574,24 @@ class TrieTest(unittest.TestCase):
|
||||
trie.add("TOKEN]")
|
||||
trie.add("[SPECIAL_TOKEN]")
|
||||
self.assertEqual(trie.split("This is something [SPECIAL_TOKEN]"), ["This is something ", "[SPECIAL_TOKEN]"])
|
||||
|
||||
def test_trie_subtokens(self):
    """Short tokens found inside a longer token must not break it apart.

    "A" and "P" both occur inside "[SPECIAL_TOKEN]"; the split is
    expected to keep the longer token in one piece.
    """
    subtoken_trie = Trie()
    for token in ("A", "P", "[SPECIAL_TOKEN]"):
        subtoken_trie.add(token)

    pieces = subtoken_trie.split("This is something [SPECIAL_TOKEN]")
    self.assertEqual(pieces, ["This is something ", "[SPECIAL_TOKEN]"])
|
||||
|
||||
def test_trie_suffix_tokens(self):
    """A token that is a suffix of another must not steal its match.

    "B" is a suffix of "AB"; splitting "ABC" is expected to yield
    "AB" followed by "C".
    """
    suffix_trie = Trie()
    for token in ("AB", "B", "C"):
        suffix_trie.add(token)

    pieces = suffix_trie.split("ABC")
    self.assertEqual(pieces, ["AB", "C"])
|
||||
|
||||
def test_cut_text_hardening(self):
    """`cut_text` is expected to return correct parts for bad offsets."""
    # The offsets below contain a duplicate (0, 0) and go backwards
    # (2 -> 1); the expected output is still the clean parts of the text.
    text = "ABC"
    messy_offsets = [0, 0, 2, 1, 2, 3]

    empty_trie = Trie()
    parts = empty_trie.cut_text(text, messy_offsets)

    self.assertEqual(parts, ["AB", "C"])
|
||||
|
||||
Reference in New Issue
Block a user