added tokenizers serialization tests

This commit is contained in:
thomwolf
2019-04-15 12:03:56 +02:00
parent 3e65f255dc
commit 870b734bfd
7 changed files with 51 additions and 32 deletions

View File

@@ -52,5 +52,21 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
self.assertListEqual(
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
vocab_file, merges_file = tokenizer.save_vocabulary(vocab_path="/tmp/")
tokenizer.from_pretrained("/tmp/")
os.remove(vocab_file)
os.remove(merges_file)
text = "lower"
bpe_tokens = ["low", "er</w>"]
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + ["<unk>"]
input_bpe_tokens = [14, 15, 20]
self.assertListEqual(
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
# Script entry point: hand control to unittest's command-line runner when this
# file is executed directly; the guard keeps it from running on import.
if __name__ == '__main__':
    unittest.main()