From d340e2329e13674e61bbe763ea577769ef1984c8 Mon Sep 17 00:00:00 2001
From: LysandreJik
Date: Tue, 24 Sep 2019 09:09:28 -0400
Subject: [PATCH] create_mask_from_sequences -> create_token_type_ids_from_sequences

---
 pytorch_transformers/tokenization_bert.py       |  2 +-
 pytorch_transformers/tokenization_distilbert.py | 11 -----------
 pytorch_transformers/tokenization_roberta.py    |  2 +-
 pytorch_transformers/tokenization_utils.py      |  4 ++--
 pytorch_transformers/tokenization_xlm.py        |  2 +-
 pytorch_transformers/tokenization_xlnet.py      |  2 +-
 6 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 211b8fe93b..7eca60a140 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
 
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def create_mask_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
diff --git a/pytorch_transformers/tokenization_distilbert.py b/pytorch_transformers/tokenization_distilbert.py
index 0af782beb1..547ea29981 100644
--- a/pytorch_transformers/tokenization_distilbert.py
+++ b/pytorch_transformers/tokenization_distilbert.py
@@ -67,14 +67,3 @@ class DistilBertTokenizer(BertTokenizer):
     def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
         sep = [self.sep_token_id]
         return token_ids_0 + sep + token_ids_1
-
-    def create_mask_from_sequences(self, sequence_0, sequence_1):
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
-        """
-        sep = [self.sep_token_id]
-
-        return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1)) * [1]
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index 161c4e6870..475aee47fa 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -97,7 +97,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
-    def create_mask_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A RoBERTa sequence pair mask has the following format:
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 02b4bef699..c5efd37a53 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -780,7 +780,7 @@ class PreTrainedTokenizer(object):
                 )
 
                 if output_token_type:
-                    information["token_type_ids"] = self.create_mask_from_sequences(text, text_pair)
+                    information["token_type_ids"] = self.create_token_type_ids_from_sequences(text, text_pair)
             else:
                 logger.warning("No special tokens were added. The two sequences have been concatenated.")
                 sequence = first_sentence_tokens + second_sentence_tokens
@@ -863,7 +863,7 @@ class PreTrainedTokenizer(object):
 
         return information
 
-    def create_mask_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
         logger.warning("This tokenizer does not make use of special tokens.")
         return [0] * len(self.encode(sequence_0)) + [1] * len(self.encode(sequence_1))
 
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 3b177b6a17..833a8d8be6 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def create_mask_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An XLM sequence pair mask has the following format:
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index 1464743759..5febf16418 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
-    def create_mask_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format: