From bf503158c558d11cf6ab6ce8cf2e91e3e81b479a Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 19 Sep 2019 09:55:36 +0200 Subject: [PATCH] Sentence -> Sequence. Removed output_mask from the special token addition methods. --- examples/run_lm_finetuning.py | 2 +- .../tests/tokenization_bert_test.py | 4 +-- .../tests/tokenization_distilbert_test.py | 4 +-- .../tests/tokenization_roberta_test.py | 4 +-- .../tests/tokenization_tests_commons.py | 28 +++++++++---------- .../tests/tokenization_xlm_test.py | 4 +-- .../tests/tokenization_xlnet_test.py | 4 +-- pytorch_transformers/tokenization_bert.py | 13 +++------ .../tokenization_distilbert.py | 4 +-- pytorch_transformers/tokenization_roberta.py | 12 ++------ pytorch_transformers/tokenization_utils.py | 21 ++++++-------- pytorch_transformers/tokenization_xlm.py | 13 ++------- pytorch_transformers/tokenization_xlnet.py | 12 ++------ 13 files changed, 49 insertions(+), 76 deletions(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 974a84c34e..556cce6753 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -75,7 +75,7 @@ class TextDataset(Dataset): tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) while len(tokenized_text) >= block_size: # Truncate in block of block_size - self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size])) + self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[:block_size])) tokenized_text = tokenized_text[block_size:] # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) # If your dataset is small, first you should loook for a bigger one :-) and second you diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py index 1111683ecc..4cfdfed136 100644 --- a/pytorch_transformers/tests/tokenization_bert_test.py +++ b/pytorch_transformers/tests/tokenization_bert_test.py @@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) - encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) + encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) assert encoded_sentence == [101] + text + [102] assert encoded_pair == [101] + text + [102] + text_2 + [102] diff --git a/pytorch_transformers/tests/tokenization_distilbert_test.py b/pytorch_transformers/tests/tokenization_distilbert_test.py index 30160fc98e..ef3cac10be 100644 --- a/pytorch_transformers/tests/tokenization_distilbert_test.py +++ b/pytorch_transformers/tests/tokenization_distilbert_test.py @@ -36,8 +36,8 @@ class DistilBertTokenizationTest(BertTokenizationTest): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) - encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) + encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) assert encoded_sentence == text assert encoded_pair == text + [102] + text_2 diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py index 8add2529a5..ba2651c7f2 100644 --- a/pytorch_transformers/tests/tokenization_roberta_test.py +++ b/pytorch_transformers/tests/tokenization_roberta_test.py @@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) - encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) - encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) + encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) assert encoded_sentence == encoded_text_from_decode assert encoded_pair == encoded_pair_from_decode diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py index 4f85e2bdd6..e16891a5db 100644 --- a/pytorch_transformers/tests/tokenization_tests_commons.py +++ b/pytorch_transformers/tests/tokenization_tests_commons.py @@ -187,18 +187,18 @@ class CommonTestCases: for weights_list_2 in weights_lists_2: self.assertListEqual(weights_list, weights_list_2) - def test_mask_output(self): - if sys.version_info <= (3, 0): - return - - tokenizer = self.get_tokenizer() - - if tokenizer.add_special_tokens_sentences_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer": - seq_0 = "Test this method." - seq_1 = "With these inputs." - information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True) - sequences, mask = information["sequence"], information["mask"] - assert len(sequences) == len(mask) + # def test_mask_output(self): + # if sys.version_info <= (3, 0): + # return + # + # tokenizer = self.get_tokenizer() + # + # if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer": + # seq_0 = "Test this method." + # seq_1 = "With these inputs." + # information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True) + # sequences, mask = information["sequence"], information["mask"] + # assert len(sequences) == len(mask) def test_number_of_added_tokens(self): tokenizer = self.get_tokenizer() @@ -228,7 +228,7 @@ class CommonTestCases: assert len(overflowing_tokens) == 2 assert len(truncated_sequence) == total_length - 2 - assert truncated_sequence == tokenizer.add_special_tokens_single_sentence(sequence[:-2]) + assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2]) def test_maximum_encoding_length_pair_input(self): tokenizer = self.get_tokenizer() @@ -237,7 +237,7 @@ class CommonTestCases: seq_1 = "This is another sentence to be encoded." sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) - truncated_second_sequence = tokenizer.add_special_tokens_sentences_pair( + truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair( tokenizer.encode(seq_0), tokenizer.encode(seq_1)[:-2] ) diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py index 43f1e0c5dd..13fdb4a8bb 100644 --- a/pytorch_transformers/tests/tokenization_xlm_test.py +++ b/pytorch_transformers/tests/tokenization_xlm_test.py @@ -72,8 +72,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) - encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) + encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) assert encoded_sentence == [1] + text + [1] assert encoded_pair == [1] + text + [1] + text_2 + [1] diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py index c603ce55f9..a6e9f23fe7 100644 --- a/pytorch_transformers/tests/tokenization_xlnet_test.py +++ b/pytorch_transformers/tests/tokenization_xlnet_test.py @@ -95,8 +95,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) - encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) + encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) + encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) assert encoded_sentence == text + [4, 3] assert encoded_pair == text + [4] + text_2 + [4, 3] diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py index f720d530f8..0aae2d6a5f 100644 --- a/pytorch_transformers/tokenization_bert.py +++ b/pytorch_transformers/tokenization_bert.py @@ -187,27 +187,22 @@ class BertTokenizer(PreTrainedTokenizer): out_string = ' '.join(tokens).replace(' ##', '').strip() return out_string - def add_special_tokens_single_sentence(self, token_ids): + def add_special_tokens_single_sequence(self, token_ids): """ Adds special tokens to the a sequence for sequence classification tasks. A BERT sequence has the following format: [CLS] X [SEP] """ return [self.cls_token_id] + token_ids + [self.sep_token_id] - def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False): + def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP] """ sep = [self.sep_token_id] cls = [self.cls_token_id] - if output_mask: - return ( - cls + token_ids_0 + sep + token_ids_1 + sep, - [0] * len(cls + token_ids_0 + sep) + [1] * len(token_ids_1 + sep) - ) - else: - return cls + token_ids_0 + sep + token_ids_1 + sep + + return cls + token_ids_0 + sep + token_ids_1 + sep def save_vocabulary(self, vocab_path): """Save the tokenizer vocabulary to a directory or file.""" diff --git a/pytorch_transformers/tokenization_distilbert.py b/pytorch_transformers/tokenization_distilbert.py index f91989d2bd..ce2ace35d8 100644 --- a/pytorch_transformers/tokenization_distilbert.py +++ b/pytorch_transformers/tokenization_distilbert.py @@ -61,10 +61,10 @@ class DistilBertTokenizer(BertTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def add_special_tokens_single_sentence(self, token_ids): + def add_special_tokens_single_sequence(self, token_ids): return token_ids - def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False): + def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1, output_mask=False): sep = [self.sep_token_id] if output_mask: return ( diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py index b41d85e7e2..b39904711a 100644 --- a/pytorch_transformers/tokenization_roberta.py +++ b/pytorch_transformers/tokenization_roberta.py @@ -81,24 +81,18 @@ class RobertaTokenizer(GPT2Tokenizer): sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, **kwargs) - def add_special_tokens_single_sentence(self, token_ids): + def add_special_tokens_single_sequence(self, token_ids): """ Adds special tokens to a sequence for sequence classification tasks. A RoBERTa sequence has the following format: X """ return [self.cls_token_id] + token_ids + [self.sep_token_id] - def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False): + def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. A RoBERTa sequence pair has the following format: A B """ sep = [self.sep_token_id] cls = [self.cls_token_id] - if output_mask: - return ( - cls + token_ids_0 + sep + sep + token_ids_1 + sep, - [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep) - ) - else: - return cls + token_ids_0 + sep + sep + token_ids_1 + sep + return cls + token_ids_0 + sep + sep + token_ids_1 + sep diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py index 2a31aec887..185ca09576 100644 --- a/pytorch_transformers/tokenization_utils.py +++ b/pytorch_transformers/tokenization_utils.py @@ -708,7 +708,7 @@ class PreTrainedTokenizer(object): if text_pair is None: if add_special_tokens: sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) - return self.add_special_tokens_single_sentence(sequence_tokens) + return self.add_special_tokens_single_sequence(sequence_tokens) else: ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) return ids @@ -717,7 +717,7 @@ class PreTrainedTokenizer(object): second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] if add_special_tokens: - return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens) + return self.add_special_tokens_sequence_pair(first_sentence_tokens, second_sentence_tokens) else: logger.warning("No special tokens were added. The two sequences have been concatenated.") return first_sentence_tokens + second_sentence_tokens @@ -747,7 +747,7 @@ class PreTrainedTokenizer(object): if max_length: information["overflowing_tokens"] = sequence_tokens[max_length - n_added_tokens:] sequence_tokens = sequence_tokens[:max_length - n_added_tokens] - sequence = self.add_special_tokens_single_sentence(sequence_tokens) + sequence = self.add_special_tokens_single_sequence(sequence_tokens) else: sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if max_length: @@ -774,16 +774,13 @@ class PreTrainedTokenizer(object): information["overflowing_tokens"] = second_sentence_tokens[max_length - f_len - n_added_tokens:] second_sentence_tokens = second_sentence_tokens[:max_length - f_len - n_added_tokens] - encoded_sequence = self.add_special_tokens_sentences_pair( + sequence = self.add_special_tokens_sequence_pair( first_sentence_tokens, - second_sentence_tokens, - output_mask + second_sentence_tokens ) - if output_mask: - sequence, information["mask"] = encoded_sequence - else: - sequence = encoded_sequence + # if output_mask: + # sequence, information["mask"] = encoded_sequence information["sequence"] = sequence else: @@ -800,11 +797,11 @@ class PreTrainedTokenizer(object): return information - def add_special_tokens_single_sentence(self, token_ids): + def add_special_tokens_single_sequence(self, token_ids): logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.") return token_ids - def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False): + def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.") return token_ids_0 + token_ids_1 diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 167860f15f..5a29944d10 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -754,28 +754,21 @@ class XLMTokenizer(PreTrainedTokenizer): out_string = ''.join(tokens).replace('', ' ').strip() return out_string - def add_special_tokens_single_sentence(self, token_ids): + def add_special_tokens_single_sequence(self, token_ids): """ Adds special tokens to a sequence for sequence classification tasks. An XLM sequence has the following format: [CLS] X [SEP] """ return [self.cls_token_id] + token_ids + [self.sep_token_id] - def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False): + def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence pair for sequence classification tasks. An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP] """ sep = [self.sep_token_id] cls = [self.cls_token_id] - - if output_mask: - return ( - cls + token_ids_0 + sep + token_ids_1 + sep, - [0] * len(cls + token_ids_0 + sep) + [1] * len(token_ids_1 + sep) - ) - else: - return cls + token_ids_0 + sep + token_ids_1 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep def save_vocabulary(self, save_directory): """Save the tokenizer vocabulary and merge files to a directory.""" diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py index 9faa3c7e59..fecc35b070 100644 --- a/pytorch_transformers/tokenization_xlnet.py +++ b/pytorch_transformers/tokenization_xlnet.py @@ -181,7 +181,7 @@ class XLNetTokenizer(PreTrainedTokenizer): out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() return out_string - def add_special_tokens_single_sentence(self, token_ids): + def add_special_tokens_single_sequence(self, token_ids): """ Adds special tokens to a sequence pair for sequence classification tasks. An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS] @@ -190,7 +190,7 @@ class XLNetTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return token_ids + sep + cls - def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False): + def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): """ Adds special tokens to a sequence for sequence classification tasks. An XLNet sequence has the following format: X [SEP][CLS] @@ -199,13 +199,7 @@ class XLNetTokenizer(PreTrainedTokenizer): sep = [self.sep_token_id] cls = [self.cls_token_id] cls_segment_ids = [2] - if output_mask: - return ( - token_ids_0 + sep + token_ids_1 + sep + cls, - [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids - ) - else: - return token_ids_0 + sep + token_ids_1 + sep + cls + return token_ids_0 + sep + token_ids_1 + sep + cls def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file