From 6179f537a3e2c8db472bce964d4f4cb6fdc09204 Mon Sep 17 00:00:00 2001
From: thomwolf
Date: Mon, 4 Feb 2019 17:41:22 +0100
Subject: [PATCH] clean up tokenization spaces

---
 pytorch_pretrained_bert/tokenization_openai.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index aba531caed..616b68db59 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -225,8 +225,14 @@ class OpenAIGPTTokenizer(object):
                 tokens.append(self.decoder[i])
         return tokens
 
-    def decode(self, ids, skip_special_tokens=False):
+    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False):
         """Converts a sequence of ids in a string."""
         tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
+        if clean_up_tokenization_spaces:
+            out_string = out_string.replace('<unk>', '')
+            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
+                    ).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't"
+                    ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m "
+                    ).replace(" 've", "'ve")
         return out_string