GPT-2 option to avoid predicting special tokens

This commit is contained in:
thomwolf
2019-05-07 16:25:53 +02:00
parent e211785ada
commit d1b6979aa5
2 changed files with 17 additions and 13 deletions

View File

@@ -263,8 +263,8 @@ class GPT2Tokenizer(object):
def encode(self, text):
    """Tokenize ``text`` and map the resulting tokens to ids.

    The text is split with ``self.tokenize``; the tokens are then handed
    to ``self.convert_tokens_to_ids`` and its result is returned as-is.
    """
    pieces = self.tokenize(text)
    ids = self.convert_tokens_to_ids(pieces)
    return ids
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
def decode(self, tokens, skip_special_tokens=False):
    """Turn a sequence of token ids back into a text string.

    Args:
        tokens: ids to convert; passed to ``self.convert_ids_to_tokens``.
        skip_special_tokens: forwarded unchanged to
            ``self.convert_ids_to_tokens`` (presumably makes it drop
            special tokens from the output -- confirm against that method).

    Returns:
        The decoded string.
    """
    pieces = self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens)
    joined = ''.join(pieces)
    # Each character of the joined tokens stands for one raw byte value;
    # rebuild the byte sequence before decoding it as UTF-8.
    raw = bytearray(self.byte_decoder[ch] for ch in joined)
    return raw.decode('utf-8', errors=self.errors)