Fix GPT2 and RoBERTa tokenizer to beging with a space - update Roberta tokenizer
This commit is contained in:
@@ -109,11 +109,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
|
||||
super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
|
||||
|
||||
self.encoder = json.load(open(vocab_file))
|
||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||
self.errors = errors # how to handle errors in decoding
|
||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
self.errors = errors # how to handle errors in decoding
|
||||
self.byte_encoder = bytes_to_unicode()
|
||||
self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
|
||||
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
|
||||
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
|
||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||
@@ -169,6 +169,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
|
||||
def _tokenize(self, text):
|
||||
""" Tokenize a string. """
|
||||
text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with.
|
||||
bpe_tokens = []
|
||||
for token in re.findall(self.pat, text):
|
||||
if sys.version_info[0] == 2:
|
||||
@@ -214,4 +215,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
writer.write(' '.join(bpe_tokens) + u'\n')
|
||||
index += 1
|
||||
|
||||
return vocab_file, merge_file
|
||||
return vocab_file, merge_file
|
||||
Reference in New Issue
Block a user