Fix convert_token_type_ids_from_sequences for fast tokenizers (#4503)

This commit is contained in:
Anthony MOI
2020-05-22 12:45:10 -04:00
committed by GitHub
parent f7677e1623
commit 35df911485
3 changed files with 69 additions and 0 deletions

View File

@@ -75,6 +75,7 @@ class CommonFastTokenizerTest(unittest.TestCase):
self.assert_special_tokens_map_equal(tokenizer_r, tokenizer_p)
self.assert_embeded_special_tokens(tokenizer_r, tokenizer_p)
self.assert_padding(tokenizer_r, tokenizer_p)
self.assert_create_token_type_ids(tokenizer_r, tokenizer_p)
# TODO: enable for v3.0.0
# self.assert_empty_output_no_special_tokens(tokenizer_r, tokenizer_p)
@@ -308,6 +309,20 @@ class CommonFastTokenizerTest(unittest.TestCase):
self.assertEqual(len(tokens[key].shape), 2)
self.assertEqual(tokens[key].shape[-1], 6)
def assert_create_token_type_ids(self, tokenizer_r, tokenizer_p):
input_simple = [1, 2, 3]
input_pair = [1, 2, 3]
# Generate output
output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple)
output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple)
self.assertEqual(output_p, output_r)
# Generate pair output
output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair)
output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair)
self.assertEqual(output_p, output_r)
def assert_build_inputs_with_special_tokens(self, tokenizer_r, tokenizer_p):
# Input string
input_simple = tokenizer_p.tokenize("This is a sample input")