Fix misleading RoBERTa token type ids
This commit is contained in:
@@ -144,9 +144,7 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A RoBERTa sequence pair mask has the following format:
|
RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
|
||||||
| first sequence | second sequence
|
|
||||||
|
|
||||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
"""
|
"""
|
||||||
@@ -155,4 +153,4 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
|
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return len(cls + token_ids_0 + sep) * [0]
|
return len(cls + token_ids_0 + sep) * [0]
|
||||||
return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
|
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
|
||||||
|
|||||||
Reference in New Issue
Block a user