Number of added tokens calculator

This commit is contained in:
LysandreJik
2019-09-11 11:20:07 +02:00
parent de8e14b6c0
commit d572d7027b
2 changed files with 43 additions and 0 deletions

View File

@@ -198,3 +198,16 @@ class CommonTestCases:
seq_1 = "With these inputs."
sequences, mask = tokenizer.encode(seq_0, seq_1, add_special_tokens=True, output_mask=True)
assert len(sequences) == len(mask)
def test_number_of_added_tokens(self):
    """Check ``num_added_tokens(pair=True)`` against a real pair encoding.

    The same two sentences are encoded once without and once with
    ``add_special_tokens=True``. The difference between the length of the
    special-token encoding and the summed lengths of the plain encodings
    must equal the count reported by ``tokenizer.num_added_tokens(pair=True)``.
    """
    tokenizer = self.get_tokenizer()

    seq_0 = "Test this method."
    seq_1 = "With these inputs."

    sequences = tokenizer.encode(seq_0, seq_1)
    attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

    # Method is implemented (e.g. not GPT-2)
    # NOTE(review): a result of exactly two items is presumably the pair
    # returned unjoined when the tokenizer does not support special tokens
    # for pairs -- confirm against the tokenizer's encode implementation.
    if len(attached_sequences) != 2:
        # Total number of tokens produced for both sentences without any
        # special tokens; each element of `sequences` is measured with len().
        plain_length = sum(len(seq) for seq in sequences)
        expected_added = len(attached_sequences) - plain_length
        assert tokenizer.num_added_tokens(pair=True) == expected_added