From f8aace6bcd1f72ba962263be3de6876572a366a5 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 30 Aug 2019 13:39:52 +0200 Subject: [PATCH] update tokenizers to use self.XX_token_id instead of converting self.XX_token --- pytorch_transformers/tokenization_bert.py | 6 +++--- pytorch_transformers/tokenization_roberta.py | 6 +++--- pytorch_transformers/tokenization_xlm.py | 6 +++--- pytorch_transformers/tokenization_xlnet.py | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py index 04f35aa466..434898d1aa 100644 --- a/pytorch_transformers/tokenization_bert.py +++ b/pytorch_transformers/tokenization_bert.py @@ -171,15 +171,15 @@ class BertTokenizer(PreTrainedTokenizer): Adds special tokens to the a sequence for sequence classification tasks. A BERT sequence has the following format: [CLS] X [SEP] """ - return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] + return [self.cls_token_id] + token_ids + [self.sep_token_id] def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep def save_vocabulary(self, vocab_path): diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index f290168c95..7c8b3587a1 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -86,13 +86,13 @@ class RobertaTokenizer(GPT2Tokenizer): Adds special tokens to a sequence for sequence classification tasks. A RoBERTa sequence has the following format: [CLS] X [SEP] """ - return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] + return [self.cls_token_id] + token_ids + [self.sep_token_id] def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. A RoBERTa sequence pair has the following format: [CLS] A [SEP][SEP] B [SEP] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 2d2f3a8cd4..ae9fe6c828 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -220,15 +220,15 @@ class XLMTokenizer(PreTrainedTokenizer): Adds special tokens to a sequence for sequence classification tasks. An XLM sequence has the following format: [CLS] X [SEP] """ - return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] + return [self.cls_token_id] + token_ids + [self.sep_token_id] def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep def save_vocabulary(self, save_directory): diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py index 371b3c9407..b479a2832d 100644 --- a/pytorch_transformers/tokenization_xlnet.py +++ b/pytorch_transformers/tokenization_xlnet.py @@ -182,8 +182,8 @@ class XLNetTokenizer(PreTrainedTokenizer): Adds special tokens to a sequence pair for sequence classification tasks. An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return token_ids + sep + cls def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): @@ -191,8 +191,8 @@ class XLNetTokenizer(PreTrainedTokenizer): Adds special tokens to a sequence for sequence classification tasks. An XLNet sequence has the following format: X [SEP][CLS] """ - sep = [self._convert_token_to_id(self.sep_token)] - cls = [self._convert_token_to_id(self.cls_token)] + sep = [self.sep_token_id] + cls = [self.cls_token_id] return token_ids_0 + sep + token_ids_1 + sep + cls def save_vocabulary(self, save_directory):