Override build_inputs_with_special_tokens for fast tokenizers (#2912)

* Override build_inputs_with_special_tokens for fast impl + unittest. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * Quality + format. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-02-19 22:09:51 +01:00
parent 59c23ad9c9
commit e676764241
4 changed files with 68 additions and 0 deletions
--- a/src/transformers/tokenization_roberta.py
+++ b/src/transformers/tokenization_roberta.py
@@ -210,3 +210,10 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
        # We need to recompute max_len according to the newly register post_processor to get real values.
        self.max_len_single_sentence = self.max_len - self.num_added_tokens(False)  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - self.num_added_tokens(True)  # take into account special tokens
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]  # bos id + first sequence + eos id
+        if token_ids_1 is None:
+            return output  # single sequence: nothing more to append
+        # Pair: the second sequence gets an eos id on both sides, so two eos ids separate the two sequences.
+        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]