Added tokenizer serialization tests
This commit is contained in:
@@ -263,7 +263,10 @@ class OpenAIGPTTokenizer(object):
|
||||
return out_string
|
||||
|
||||
def save_vocabulary(self, vocab_path):
|
||||
"""Save the tokenizer vocabulary to a path."""
|
||||
"""Save the tokenizer vocabulary and merge files to a directory."""
|
||||
if not os.path.isdir(vocab_path):
|
||||
logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
|
||||
return
|
||||
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
|
||||
merge_file = os.path.join(vocab_path, MERGES_NAME)
|
||||
json.dump(self.encoder, vocab_file)
|
||||
@@ -277,3 +280,4 @@ class OpenAIGPTTokenizer(object):
|
||||
index = token_index
|
||||
writer.write(bpe_tokens + u'\n')
|
||||
index += 1
|
||||
return vocab_file, merge_file
|
||||
|
||||
Reference in New Issue
Block a user