From 036483fae538faff62f78448b38787f3adb94f97 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 9 Oct 2019 16:33:15 -0400
Subject: [PATCH] Temporary CTRL tokenizer fix

---
 transformers/tokenization_ctrl.py | 30 +++---------------------------
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py
index afe8fa70e3..386a51f85d 100644
--- a/transformers/tokenization_ctrl.py
+++ b/transformers/tokenization_ctrl.py
@@ -108,17 +108,6 @@ class CTRLTokenizer(PreTrainedTokenizer):
         self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
         self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
 
-        try:
-            import ftfy
-            from spacy.lang.en import English
-            _nlp = English()
-            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
-            self.fix_text = ftfy.fix_text
-        except ImportError:
-            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True)
-            self.fix_text = None
-
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v:k for k,v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
@@ -177,22 +166,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
         """ Tokenize a string.
         """
         split_tokens = []
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer
-            text = self.nlp.tokenize(text)
-            for token in text:
-                split_tokens.extend([t for t in self.bpe(token).split(' ')])
-        else:
-            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
-            text = self.nlp(text_standardize(self.fix_text(text)))
-            for token in text:
-                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
-        # for token in text.split():
-        #     if sys.version_info[0] == 2:
-        #         token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
-        #     else:
-        #         token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
-        #     bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
+        text = text.split(' ')
+        for token in text:
+            split_tokens.extend([t for t in self.bpe(token).split(' ')])
         return split_tokens
 
     def _convert_token_to_id(self, token):