Merge branch 'master' into improved_testing

This commit is contained in:
Thomas Wolf
2019-08-30 13:40:35 +02:00
committed by GitHub
43 changed files with 3103 additions and 56 deletions

View File

@@ -112,6 +112,8 @@ class GPT2Tokenizer(PreTrainedTokenizer):
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
self.decoder = {v: k for k, v in self.encoder.items()}