From 1e51bb717c04ca4b01a05a7a548e6b550be38628 Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 13 May 2020 14:32:57 +0200 Subject: [PATCH] Fix for #3865. PreTrainedTokenizer mapped " do not" into " don't" when .decode(...) was called. Removed the " do not" --> " don't" mapping from clean_up_tokenization(...). (#4024) --- src/transformers/tokenization_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index e929d0273c..b7f8e755e8 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -2195,7 +2195,6 @@ class PreTrainedTokenizer(SpecialTokensMixin): .replace(" ' ", "'") .replace(" n't", "n't") .replace(" 'm", "'m") - .replace(" do not", " don't") .replace(" 's", "'s") .replace(" 've", "'ve") .replace(" 're", "'re")