Fix GPT2 and RoBERTa tokenizers to begin with a space - update RoBERTa tokenizer

This commit is contained in:
thomwolf
2019-08-30 11:23:49 +02:00
parent 55f69a11b6
commit 0517e7a1cb
3 changed files with 9 additions and 116 deletions

View File

@@ -109,11 +109,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
self.encoder = json.load(open(vocab_file))
self.decoder = {v:k for k,v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
@@ -169,6 +169,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
def _tokenize(self, text):
""" Tokenize a string. """
text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with.
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
@@ -214,4 +215,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
writer.write(' '.join(bpe_tokens) + u'\n')
index += 1
return vocab_file, merge_file
return vocab_file, merge_file