From dfe012ad9d6b6f0c9d30bc508b9f1e4c42280c07 Mon Sep 17 00:00:00 2001
From: Lysandre
Date: Tue, 7 Jan 2020 13:44:56 +0100
Subject: [PATCH] Fix misleading RoBERTa token type ids

---
 src/transformers/tokenization_roberta.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py
index ed97058021..e14f12f449 100644
--- a/src/transformers/tokenization_roberta.py
+++ b/src/transformers/tokenization_roberta.py
@@ -144,9 +144,7 @@ class RobertaTokenizer(GPT2Tokenizer):
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A RoBERTa sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence
+        RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
 
         if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
@@ -155,4 +153,4 @@ class RobertaTokenizer(GPT2Tokenizer):
 
         if token_ids_1 is None:
             return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]