CamemBERT does not make use of Token Type IDs (#4289)
This commit is contained in:
@@ -102,6 +102,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||||
|
model_input_names = ["attention_mask"]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -200,14 +201,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
) -> List[int]:
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A CamemBERT sequence pair mask has the following format:
|
CamemBERT, like RoBERTa, does not make use of token type ids; therefore, a list of zeros is returned.
|
||||||
|
|
||||||
::
|
|
||||||
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
|
||||||
| first sequence | | second sequence |
|
|
||||||
|
|
||||||
If token_ids_1 is None, only the first portion of the mask (0s) is returned.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
token_ids_0 (:obj:`List[int]`):
|
token_ids_0 (:obj:`List[int]`):
|
||||||
@@ -216,15 +210,15 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
Optional second list of IDs for sequence pairs.
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
:obj:`List[int]`: List of zeros.
|
||||||
sequence(s).
|
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return len(cls + token_ids_0 + sep) * [0]
|
return len(cls + token_ids_0 + sep) * [0]
|
||||||
return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
|
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def vocab_size(self):
|
||||||
|
|||||||
Reference in New Issue
Block a user