From 051dcb2a07390c3cd41a88699074981e5480a7d5 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 11 May 2020 13:31:03 -0400 Subject: [PATCH] CamemBERT does not make use of Token Type IDs (#4289) --- src/transformers/tokenization_camembert.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/transformers/tokenization_camembert.py b/src/transformers/tokenization_camembert.py index 4a1069f737..07feb56c16 100644 --- a/src/transformers/tokenization_camembert.py +++ b/src/transformers/tokenization_camembert.py @@ -102,6 +102,7 @@ class CamembertTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] def __init__( self, @@ -200,14 +201,7 @@ class CamembertTokenizer(PreTrainedTokenizer): ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. - A CamemBERT sequence pair mask has the following format: - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | | second sequence | - - if token_ids_1 is None, only returns the first portion of the mask (0s). + CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. Args: token_ids_0 (:obj:`List[int]`): @@ -216,15 +210,15 @@ class CamembertTokenizer(PreTrainedTokenizer): Optional second list of IDs for sequence pairs. Returns: - :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given - sequence(s). + :obj:`List[int]`: List of zeros. + """ sep = [self.sep_token_id] cls = [self.cls_token_id] if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] @property def vocab_size(self):