Preserve spaces in GPT-2 tokenizers (#2778)

* Preserve spaces in GPT-2 tokenizers

Preserves spaces after special tokens in GPT-2 and inhereted (RoBERTa)
tokenizers, enabling correct BPE encoding. Automatically inserts a space
in front of first token in encode function when adding special tokens.

* Add tokenization preprocessing method

* Add framework argument to pipeline factory

Also fixes pipeline test issue. Each test input now treated as a
distinct sequence.
This commit is contained in:
Joe Davison
2020-02-13 13:29:43 -05:00
committed by GitHub
parent 0ed630f139
commit f1e8a51f08
7 changed files with 141 additions and 41 deletions

View File

@@ -110,3 +110,41 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
assert encoded_sentence == encoded_text_from_decode
assert encoded_pair == encoded_pair_from_decode
def test_space_encoding(self):
tokenizer = self.get_tokenizer()
sequence = "Encode this sequence."
space_encoding = tokenizer.byte_encoder[" ".encode("utf-8")[0]]
# Testing encoder arguments
encoded = tokenizer.encode(sequence, add_special_tokens=False)
first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
self.assertNotEqual(first_char, space_encoding)
encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
self.assertEqual(first_char, space_encoding)
tokenizer.add_special_tokens({"bos_token": "<s>"})
encoded = tokenizer.encode(sequence, add_special_tokens=True)
first_char = tokenizer.convert_ids_to_tokens(encoded[1])[0]
self.assertEqual(first_char, space_encoding)
# Testing spaces after special tokenss
mask = "<mask>"
tokenizer.add_special_tokens({"mask_token": mask})
mask_ind = tokenizer.convert_tokens_to_ids(mask)
sequence = "Encode <mask> sequence"
sequence_nospace = "Encode <mask>sequence"
encoded = tokenizer.encode(sequence)
mask_loc = encoded.index(mask_ind)
first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
self.assertEqual(first_char, space_encoding)
encoded = tokenizer.encode(sequence_nospace)
mask_loc = encoded.index(mask_ind)
first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
self.assertNotEqual(first_char, space_encoding)