From f5bcde0b2fcfab961580129ec27ac72f43eaa267 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Mon, 30 Sep 2019 16:04:55 -0400 Subject: [PATCH 01/11] [multiple-choice] Simplify and use tokenizer.encode_plus --- examples/README.md | 8 +- examples/run_multiple_choice.py | 16 +-- examples/utils_multiple_choice.py | 166 ++++++++++------------------- transformers/tokenization_utils.py | 68 ++++++++---- 4 files changed, 116 insertions(+), 142 deletions(-) diff --git a/examples/README.md b/examples/README.md index fb5de20a2a..382d794fcb 100644 --- a/examples/README.md +++ b/examples/README.md @@ -9,7 +9,7 @@ similar API between the different models. | [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. | | [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. | | [SQuAD](#squad) | Using BERT for question answering, examples with distributed training. | -| [Multiple Choice](#multiple choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. +| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. ## Language model fine-tuning @@ -283,17 +283,17 @@ The results are the following: loss = 0.04755385363816904 ``` -##Multiple Choice +## Multiple Choice Based on the script [`run_multiple_choice.py`](). 
#### Fine-tuning on SWAG Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data -``` +```bash #training on 4 tesla V100(16GB) GPUS export SWAG_DIR=/path/to/swag_data_dir -python ./examples/single_model_scripts/run_multiple_choice.py \ +python ./examples/run_multiple_choice.py \ --model_type roberta \ --task_name swag \ --model_name_or_path roberta-base \ diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 54f3a8a904..2ce1327c98 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -306,14 +306,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): else: examples = processor.get_train_examples(args.data_dir) logger.info("Training number: %s", str(len(examples))) - features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, - cls_token_at_end=bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end - cls_token=tokenizer.cls_token, - sep_token=tokenizer.sep_token, - sep_token_extra=bool(args.model_type in ['roberta']), - cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0, + features = convert_examples_to_features( + examples, + label_list, + args.max_seq_length, + tokenizer, pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0) + pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0 + ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) @@ -362,7 +362,7 @@ def main(): help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set') parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") + help="Run evaluation during training at each logging step.") 
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py index 7abcc5e1e9..50bb491243 100644 --- a/examples/utils_multiple_choice.py +++ b/examples/utils_multiple_choice.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BERT multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """ +""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """ from __future__ import absolute_import, division, print_function @@ -26,6 +26,8 @@ import json import csv import glob import tqdm +from typing import List +from transformers import PreTrainedTokenizer logger = logging.getLogger(__name__) @@ -34,13 +36,13 @@ logger = logging.getLogger(__name__) class InputExample(object): """A single training/test example for multiple choice""" - def __init__(self, example_id, question, contexts, endings, label=None): + def __init__(self, example_id, question, contexts, endings, label=None): """Constructs a InputExample. Args: example_id: Unique id for the example. contexts: list of str. The untokenized text of the first sequence (context of corresponding question). - question: string. The untokenized text of the second sequence (qustion). + question: string. The untokenized text of the second sequence (question). endings: list of str. multiple choice's options. Its length must be equal to contexts' length. label: (Optional) string. The label of the example. This should be specified for train and dev examples, but not for test examples. 
@@ -66,7 +68,7 @@ class InputFeatures(object): 'input_mask': input_mask, 'segment_ids': segment_ids } - for _, input_ids, input_mask, segment_ids in choices_features + for input_ids, input_mask, segment_ids in choices_features ] self.label = label @@ -192,7 +194,7 @@ class SwagProcessor(DataProcessor): return lines - def _create_examples(self, lines, type): + def _create_examples(self, lines: List[List[str]], type: str): """Creates examples for the training and dev sets.""" if type == "train" and lines[0][-1] != 'label': raise ValueError( @@ -300,24 +302,18 @@ class ArcProcessor(DataProcessor): return examples -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer, - cls_token_at_end=False, - cls_token='[CLS]', - cls_token_segment_id=1, - sep_token='[SEP]', - sequence_a_segment_id=0, - sequence_b_segment_id=1, - sep_token_extra=False, - pad_token_segment_id=0, - pad_on_left=False, - pad_token=0, - mask_padding_with_zero=True): - """ Loads a data file into a list of `InputBatch`s - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) +def convert_examples_to_features( + examples: List[InputExample], + label_list: List[str], + max_length: int, + tokenizer: PreTrainedTokenizer, + pad_token_segment_id=0, + pad_on_left=False, + pad_token=0, + mask_padding_with_zero=True, +) -> List[InputFeatures]: + """ + Loads a data file into a list of `InputFeatures` """ label_map = {label : i for i, label in enumerate(label_list)} @@ -328,125 +324,71 @@ def convert_examples_to_features(examples, label_list, max_seq_length, logger.info("Writing example %d of %d" % (ex_index, len(examples))) choices_features = [] for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)): - tokens_a = 
tokenizer.tokenize(context) - tokens_b = None + text_a = context if example.question.find("_") != -1: - #this is for cloze question - tokens_b = tokenizer.tokenize(example.question.replace("_", ending)) + # this is for cloze question + text_b = example.question.replace("_", ending) else: - tokens_b = tokenizer.tokenize(example.question + " " + ending) - # you can add seq token between quesiotn and ending. This does not make too much difference. - # tokens_b = tokenizer.tokenize(example.question) - # tokens_b += [sep_token] - # if sep_token_extra: - # tokens_b += [sep_token] - # tokens_b += tokenizer.tokenize(ending) + text_b = example.question + " " + ending - special_tokens_count = 4 if sep_token_extra else 3 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) + inputs = tokenizer.encode_plus( + text_a, + text_b, + add_special_tokens=True, + max_length=max_length, + truncate_both_sequences=True + ) + if 'overflowing_tokens' in inputs and len(inputs['overflowing_tokens']) > 0: + logger.info('Attention! you are cropping tokens (swag task is ok). ' + 'If you are training ARC and RACE and you are poping question + options,' + 'you need to try to use a bigger max seq length!') - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. 
- # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = tokens_a + [sep_token] - if sep_token_extra: - # roberta uses an extra separator b/w pairs of sentences - tokens += [sep_token] - - segment_ids = [sequence_a_segment_id] * len(tokens) - - if tokens_b: - tokens += tokens_b + [sep_token] - segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) - - if cls_token_at_end: - tokens = tokens + [cls_token] - segment_ids = segment_ids + [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - - input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) + padding_length = max_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask + token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids else: input_ids = input_ids + ([pad_token] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length + assert len(attention_mask) == max_length + assert len(token_type_ids) == max_length + choices_features.append((input_ids, attention_mask, token_type_ids)) + - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - choices_features.append((tokens, input_ids, input_mask, segment_ids)) label = label_map[example.label] if ex_index < 2: logger.info("*** Example ***") logger.info("race_id: {}".format(example.example_id)) - for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features): + for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features): logger.info("choice: {}".format(choice_idx)) - logger.info("tokens: {}".format(' '.join(tokens))) logger.info("input_ids: {}".format(' '.join(map(str, input_ids)))) - logger.info("input_mask: {}".format(' '.join(map(str, input_mask)))) - logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids)))) + logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask)))) + 
logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids)))) logger.info("label: {}".format(label)) features.append( InputFeatures( - example_id = example.example_id, - choices_features = choices_features, - label = label + example_id=example.example_id, + choices_features=choices_features, + label=label, ) ) return features -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - - # However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger - # length or only pop from context - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - logger.info('Attention! you are removing from token_b (swag task is ok). 
' - 'If you are training ARC and RACE (you are poping question + options), ' - 'you need to try to use a bigger max seq length!') - tokens_b.pop() processors = { @@ -456,7 +398,7 @@ processors = { } -GLUE_TASKS_NUM_LABELS = { +MULTIPLE_CHOICE_TASKS_NUM_LABELS = { "race", 4, "swag", 4, "arc", 4 diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 1e20588f83..ea90f33a28 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -699,6 +699,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncate_first_sequence=True, + truncate_both_sequences=False, return_tensors=None, **kwargs): """ @@ -718,7 +719,7 @@ class PreTrainedTokenizer(object): max_length: if set to a number, will limit the total sequence returned so that it has a maximum length. If there are overflowing tokens, those will be added to the returned dictionary stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens - from the main sequence returned. The value of this argument defined the number of additional tokens. + from the main sequence returned. The value of this argument defines the number of additional tokens. truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence will be truncated. 
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant @@ -731,6 +732,7 @@ class PreTrainedTokenizer(object): add_special_tokens=add_special_tokens, stride=stride, truncate_first_sequence=truncate_first_sequence, + truncate_both_sequences=truncate_both_sequences, return_tensors=return_tensors, **kwargs) @@ -743,6 +745,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncate_first_sequence=True, + truncate_both_sequences=False, return_tensors=None, **kwargs): """ @@ -761,7 +764,7 @@ class PreTrainedTokenizer(object): max_length: if set to a number, will limit the total sequence returned so that it has a maximum length. If there are overflowing tokens, those will be added to the returned dictionary stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens - from the main sequence returned. The value of this argument defined the number of additional tokens. + from the main sequence returned. The value of this argument defines the number of additional tokens. truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence will be truncated. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant @@ -788,11 +791,12 @@ class PreTrainedTokenizer(object): add_special_tokens=add_special_tokens, stride=stride, truncate_first_sequence=truncate_first_sequence, + truncate_both_sequences=truncate_both_sequences, return_tensors=return_tensors) def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, - truncate_first_sequence=True, return_tensors=None): + truncate_first_sequence=True, truncate_both_sequences=True, return_tensors=None): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. 
It adds special tokens, truncates @@ -825,21 +829,30 @@ class PreTrainedTokenizer(object): encoded_inputs = {} if max_length: n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0 - if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length: - logger.warning( - "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length." - "This pair of sequences will not be truncated.") - else: - if n_added_tokens + len_ids + len_pair_ids > max_length: - if truncate_first_sequence or not pair: - encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:] - ids = ids[:max_length - len_pair_ids - n_added_tokens] - elif not truncate_first_sequence and pair: - encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:] - pair_ids = pair_ids[:max_length - len_ids - n_added_tokens] - else: - logger.warning( - "Cannot truncate second sequence as it is not provided. No truncation.") + + if n_added_tokens + len_ids + len_pair_ids > max_length: + if truncate_both_sequences: + tokens_a, tokens_b = self._truncate_seq_pair( + copy.deepcopy(ids), + copy.deepcopy(pair_ids), + max_length=max_length - n_added_tokens + ) + encoded_inputs["overflowing_tokens"] = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):] + ids = tokens_a + pair_ids = tokens_b + elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length: + logger.warning( + "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length." 
+ "This pair of sequences will not be truncated.") + elif truncate_first_sequence or not pair: + encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:] + ids = ids[:max_length - len_pair_ids - n_added_tokens] + elif not truncate_first_sequence and pair: + encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:] + pair_ids = pair_ids[:max_length - len_ids - n_added_tokens] + else: + logger.warning( + "Cannot truncate second sequence as it is not provided. No truncation.") if add_special_tokens: sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids) @@ -862,6 +875,25 @@ class PreTrainedTokenizer(object): return encoded_inputs + def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
+ + # However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger + # length or only pop from context + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + return (tokens_a, tokens_b) + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): logger.warning("This tokenizer does not make use of special tokens.") return [0] * len(token_ids_0) + [1] * len(token_ids_1) From b3506629551b5496aa7aa80923abade04b70f166 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Mon, 30 Sep 2019 16:37:09 -0400 Subject: [PATCH 02/11] overflowing_tokens do not really make sense here, let's just return a number Co-Authored-By: Lysandre Debut --- examples/utils_multiple_choice.py | 2 +- transformers/tokenization_utils.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py index 50bb491243..a7fc1b1222 100644 --- a/examples/utils_multiple_choice.py +++ b/examples/utils_multiple_choice.py @@ -338,7 +338,7 @@ def convert_examples_to_features( max_length=max_length, truncate_both_sequences=True ) - if 'overflowing_tokens' in inputs and len(inputs['overflowing_tokens']) > 0: + if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0: logger.info('Attention! you are cropping tokens (swag task is ok). 
' 'If you are training ARC and RACE and you are poping question + options,' 'you need to try to use a bigger max seq length!') diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index ea90f33a28..27b9b0638d 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -837,7 +837,8 @@ class PreTrainedTokenizer(object): copy.deepcopy(pair_ids), max_length=max_length - n_added_tokens ) - encoded_inputs["overflowing_tokens"] = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):] + truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):] + encoded_inputs["num_truncated_tokens"] = len(truncated_tokens) ids = tokens_a pair_ids = tokens_b elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length: From 7c789c337d9a4f6e9e904d3c1351c28a94183894 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 30 Sep 2019 10:20:14 -0400 Subject: [PATCH 03/11] Always truncate argument in the encode method --- .../tests/tokenization_tests_commons.py | 17 ++++++++ transformers/tokenization_utils.py | 43 +++++++++++++------ 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index b71ba44436..73a2cb44b3 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -232,6 +232,23 @@ class CommonTestCases: assert len(truncated_sequence) == total_length - 2 assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2]) + def test_always_truncate(self): + tokenizer = self.get_tokenizer() + + seq_0 = "This is a sentence to be encoded." 
+ length_single_sequence = len(tokenizer.encode(seq_0)) + length = len(tokenizer.encode(seq_0, seq_0, add_special_tokens=True)) + + not_truncated = tokenizer.encode(seq_0, seq_0, add_special_tokens=True, max_length=length_single_sequence) + truncated = tokenizer.encode( + seq_0, seq_0, + max_length=length_single_sequence, + add_special_tokens=True, + always_truncate=True + ) + + assert truncated == not_truncated[:length_single_sequence - length] + def test_maximum_encoding_length_pair_input(self): tokenizer = self.get_tokenizer() diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index db9e9cd72e..fc7fe1df67 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -693,14 +693,15 @@ class PreTrainedTokenizer(object): raise NotImplementedError def encode(self, - text, - text_pair=None, - add_special_tokens=False, - max_length=None, - stride=0, - truncate_first_sequence=True, - return_tensors=None, - **kwargs): + text, + text_pair=None, + add_special_tokens=False, + max_length=None, + stride=0, + truncate_first_sequence=True, + return_tensors=None, + always_truncate=False, + **kwargs): """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. @@ -721,6 +722,8 @@ class PreTrainedTokenizer(object): from the main sequence returned. The value of this argument defined the number of additional tokens. truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence will be truncated. + always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the + sequences may be lost in the process. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. 
**kwargs: passed to the `self.tokenize()` method @@ -732,6 +735,7 @@ class PreTrainedTokenizer(object): stride=stride, truncate_first_sequence=truncate_first_sequence, return_tensors=return_tensors, + always_truncate=always_truncate, **kwargs) return encoded_inputs["input_ids"] @@ -744,6 +748,7 @@ class PreTrainedTokenizer(object): stride=0, truncate_first_sequence=True, return_tensors=None, + always_truncate=False, **kwargs): """ Returns a dictionary containing the encoded sequence or sequence pair and additional informations: @@ -764,6 +769,8 @@ class PreTrainedTokenizer(object): from the main sequence returned. The value of this argument defined the number of additional tokens. truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence will be truncated. + always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the + sequences may be lost in the process. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method @@ -788,11 +795,12 @@ class PreTrainedTokenizer(object): add_special_tokens=add_special_tokens, stride=stride, truncate_first_sequence=truncate_first_sequence, + always_truncate=always_truncate, return_tensors=return_tensors) def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, - truncate_first_sequence=True, return_tensors=None): + truncate_first_sequence=True, always_truncate=False, return_tensors=None): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. 
It adds special tokens, truncates @@ -812,6 +820,8 @@ class PreTrainedTokenizer(object): truncate_first_sequence: if set to `True` and an optional second list of input ids is provided, alongside a specified `max_length`, will truncate the first sequence if the total size is superior than the specified `max_length`. If set to `False`, will truncate the second sequence instead. + always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the + sequences may be lost in the process. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. @@ -826,9 +836,14 @@ class PreTrainedTokenizer(object): if max_length: n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0 if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length: - logger.warning( - "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length." - "This pair of sequences will not be truncated.") + if always_truncate: + logger.warning( + "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. " + "This pair of sequences will be truncated but one of the sequences may not be present in the resulting list of ids.") + else: + logger.warning( + "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. 
" + "This pair of sequences will not be truncated.") else: if n_added_tokens + len_ids + len_pair_ids > max_length: if truncate_first_sequence or not pair: @@ -860,6 +875,10 @@ class PreTrainedTokenizer(object): encoded_inputs["input_ids"] = sequence encoded_inputs["token_type_ids"] = token_type_ids + if always_truncate and len(encoded_inputs["input_ids"]) > max_length: + encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length] + encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length] + return encoded_inputs def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): From 2f259b228e9901e1bd46b9cb0692a40a9a7e98e7 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 30 Sep 2019 11:48:18 -0400 Subject: [PATCH 04/11] Sequence IDS --- .../tests/tokenization_tests_commons.py | 30 +++++++++++++++++++ transformers/tokenization_bert.py | 18 +++++++++++ transformers/tokenization_roberta.py | 18 +++++++++++ transformers/tokenization_utils.py | 20 ++++++++++++- transformers/tokenization_xlm.py | 18 +++++++++++ transformers/tokenization_xlnet.py | 18 +++++++++++ 6 files changed, 121 insertions(+), 1 deletion(-) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index 73a2cb44b3..f44be1f97b 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -292,3 +292,33 @@ class CommonTestCases: assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input + + def test_sequence_ids(self): + tokenizer = self.get_tokenizer() + + sequence_0 = "Encode this." + sequence_1 = "This one too please." 
+ + # Testing single inputs + encoded_sequence = tokenizer.encode(sequence_0) + encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + sequence_ids = encoded_sequence_dict["sequence_ids"] + assert len(sequence_ids) == len(encoded_sequence_w_special) + + filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [x for x in filtered_sequence if x is not None] + assert encoded_sequence == filtered_sequence + + # Testing inputs pairs + encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1) + encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + sequence_ids = encoded_sequence_dict["sequence_ids"] + assert len(sequence_ids) == len(encoded_sequence_w_special) + + filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [x for x in filtered_sequence if x is not None] + assert encoded_sequence == filtered_sequence + + diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index 42163cb8ec..b05f88b7a8 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -204,6 +204,24 @@ class BertTokenizer(PreTrainedTokenizer): return cls + token_ids_0 + sep + token_ids_1 + sep + def get_sequence_ids(self, token_ids_0, token_ids_1=None): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
+ + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + + Returns: + A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + if token_ids_1: + return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0] + else: + return [0] + ([1] * len(token_ids_0)) + [0] + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index 7adeea689e..c371ae3904 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -100,6 +100,24 @@ class RobertaTokenizer(GPT2Tokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep + def get_sequence_ids(self, token_ids_0, token_ids_1=None): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + + Returns: + A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + if token_ids_1: + return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0] + else: + return [0] + ([1] * len(token_ids_0)) + [0] + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index fc7fe1df67..5aa7f55730 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -826,7 +826,21 @@ class PreTrainedTokenizer(object): or PyTorch torch.Tensor instead of a list of python integers. Return: - a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given. + A Dictionary of shape:: + + { + input_ids: list[int], + overflowing_tokens: list[int] if a ``max_length`` is specified, else None + sequence_ids: list[int] if ``add_special_tokens`` if set to ``True`` + } + + With the fields: + ``input_ids``: list of tokens to be fed to a model + + ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + + ``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. """ pair = bool(pair_ids is not None) len_ids = len(ids) @@ -859,6 +873,7 @@ class PreTrainedTokenizer(object): if add_special_tokens: sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence) + encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) @@ -893,6 +908,9 @@ class PreTrainedTokenizer(object): logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.") return token_ids_0 + token_ids_1 + def get_sequence_ids(self, token_ids_0, token_ids_1=None): + return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): """ Converts a single index or a sequence of indices (integers) in a token " (resp.) 
a sequence of tokens (str/unicode), using the vocabulary and added tokens. diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index f1e49416a4..b6e27ee59e 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -770,6 +770,24 @@ class XLMTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep + def get_sequence_ids(self, token_ids_0, token_ids_1=None): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + + Returns: + A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + if token_ids_1: + return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0] + else: + return [0] + ([1] * len(token_ids_0)) + [0] + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index ad9efdf043..c8d59decee 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -200,6 +200,24 @@ class XLNetTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return token_ids_0 + sep + token_ids_1 + sep + cls + def get_sequence_ids(self, token_ids_0, token_ids_1=None): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
+ + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + + Returns: + A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + if token_ids_1: + return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0] + else: + return ([1] * len(token_ids_0)) + [0, 0] + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. From cc412edd42102e6e068b7cbb34d0cf11856d6ae8 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 30 Sep 2019 14:11:41 -0400 Subject: [PATCH 05/11] Supports already existing special tokens --- transformers/tests/tokenization_tests_commons.py | 12 ++++++++++++ transformers/tokenization_bert.py | 6 +++++- transformers/tokenization_roberta.py | 6 +++++- transformers/tokenization_utils.py | 2 +- transformers/tokenization_xlm.py | 6 +++++- transformers/tokenization_xlnet.py | 6 +++++- 6 files changed, 33 insertions(+), 5 deletions(-) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index f44be1f97b..d656e165aa 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -321,4 +321,16 @@ class CommonTestCases: filtered_sequence = [x for x in filtered_sequence if x is not None] assert encoded_sequence == filtered_sequence + # Testing with already existing special tokens + if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id: + tokenizer.add_special_tokens({'cls_token': '', 'sep_token': ''}) + encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + sequence_ids_orig = 
encoded_sequence_dict["sequence_ids"] + sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True) + assert len(sequence_ids) == len(encoded_sequence_w_special) + print(sequence_ids_orig, sequence_ids) + assert sequence_ids_orig == sequence_ids + + diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index b05f88b7a8..72720bfa4e 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer): return cls + token_ids_0 + sep + token_ids_1 + sep - def get_sequence_ids(self, token_ids_0, token_ids_1=None): + def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. @@ -217,6 +217,10 @@ class BertTokenizer(PreTrainedTokenizer): Returns: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ + + if special_tokens_present: + return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0)) + if token_ids_1: return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0] else: diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index c371ae3904..6f19ee699c 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -100,7 +100,7 @@ class RobertaTokenizer(GPT2Tokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_sequence_ids(self, token_ids_0, token_ids_1=None): + def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. 
This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. @@ -113,6 +113,10 @@ class RobertaTokenizer(GPT2Tokenizer): Returns: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ + + if special_tokens_present: + return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0)) + if token_ids_1: return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0] else: diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 5aa7f55730..a0f1382aa2 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -908,7 +908,7 @@ class PreTrainedTokenizer(object): logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.") return token_ids_0 + token_ids_1 - def get_sequence_ids(self, token_ids_0, token_ids_1=None): + def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) def convert_ids_to_tokens(self, ids, skip_special_tokens=False): diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index b6e27ee59e..3f398853dc 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep - def get_sequence_ids(self, token_ids_0, token_ids_1=None): + def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
@@ -783,6 +783,10 @@ class XLMTokenizer(PreTrainedTokenizer): Returns: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ + + if special_tokens_present: + return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0)) + if token_ids_1: return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0] else: diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index c8d59decee..325a32ef6f 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return token_ids_0 + sep + token_ids_1 + sep + cls - def get_sequence_ids(self, token_ids_0, token_ids_1=None): + def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. @@ -213,6 +213,10 @@ class XLNetTokenizer(PreTrainedTokenizer): Returns: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. 
""" + + if special_tokens_present: + return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0)) + if token_ids_1: return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0] else: From 5ed50a93fb4fc4ed554a54835060ab4721123d07 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 30 Sep 2019 14:14:27 -0400 Subject: [PATCH 06/11] LM finetuning won't mask special tokens anymore --- examples/run_lm_finetuning.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index a91deebb6c..024b254b56 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -108,7 +108,12 @@ def mask_tokens(inputs, tokenizer, args): """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """ labels = inputs.clone() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) - masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).bool() + probability_matrix = torch.full(labels.shape, args.mlm_probability) + probability_matrix *= torch.tensor( + [tokenizer.get_sequence_ids(val, special_tokens_present=True) for val in labels.tolist()], + dtype=torch.float + ) + masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -1 # We only compute loss on masked tokens # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) From 651bfb7ad57387b7d645420f56195b9134fff99f Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 30 Sep 2019 17:27:40 -0400 Subject: [PATCH 07/11] always_truncate by default --- .../tests/tokenization_tests_commons.py | 18 ------------- transformers/tokenization_utils.py | 26 ++++--------------- 2 files changed, 5 insertions(+), 39 deletions(-) diff --git a/transformers/tests/tokenization_tests_commons.py 
b/transformers/tests/tokenization_tests_commons.py index d656e165aa..f4805598cd 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -232,23 +232,6 @@ class CommonTestCases: assert len(truncated_sequence) == total_length - 2 assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2]) - def test_always_truncate(self): - tokenizer = self.get_tokenizer() - - seq_0 = "This is a sentence to be encoded." - length_single_sequence = len(tokenizer.encode(seq_0)) - length = len(tokenizer.encode(seq_0, seq_0, add_special_tokens=True)) - - not_truncated = tokenizer.encode(seq_0, seq_0, add_special_tokens=True, max_length=length_single_sequence) - truncated = tokenizer.encode( - seq_0, seq_0, - max_length=length_single_sequence, - add_special_tokens=True, - always_truncate=True - ) - - assert truncated == not_truncated[:length_single_sequence - length] - def test_maximum_encoding_length_pair_input(self): tokenizer = self.get_tokenizer() @@ -329,7 +312,6 @@ class CommonTestCases: sequence_ids_orig = encoded_sequence_dict["sequence_ids"] sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True) assert len(sequence_ids) == len(encoded_sequence_w_special) - print(sequence_ids_orig, sequence_ids) assert sequence_ids_orig == sequence_ids diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index a0f1382aa2..527ee8422d 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -700,7 +700,6 @@ class PreTrainedTokenizer(object): stride=0, truncate_first_sequence=True, return_tensors=None, - always_truncate=False, **kwargs): """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. @@ -722,8 +721,6 @@ class PreTrainedTokenizer(object): from the main sequence returned. The value of this argument defined the number of additional tokens. 
truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence will be truncated. - always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the - sequences may be lost in the process. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method @@ -735,7 +732,6 @@ class PreTrainedTokenizer(object): stride=stride, truncate_first_sequence=truncate_first_sequence, return_tensors=return_tensors, - always_truncate=always_truncate, **kwargs) return encoded_inputs["input_ids"] @@ -748,7 +744,6 @@ class PreTrainedTokenizer(object): stride=0, truncate_first_sequence=True, return_tensors=None, - always_truncate=False, **kwargs): """ Returns a dictionary containing the encoded sequence or sequence pair and additional informations: @@ -769,8 +764,6 @@ class PreTrainedTokenizer(object): from the main sequence returned. The value of this argument defined the number of additional tokens. truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence will be truncated. - always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the - sequences may be lost in the process. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. 
**kwargs: passed to the `self.tokenize()` method @@ -795,12 +788,10 @@ class PreTrainedTokenizer(object): add_special_tokens=add_special_tokens, stride=stride, truncate_first_sequence=truncate_first_sequence, - always_truncate=always_truncate, return_tensors=return_tensors) - def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, - truncate_first_sequence=True, always_truncate=False, return_tensors=None): + truncate_first_sequence=True, return_tensors=None): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates @@ -820,8 +811,6 @@ class PreTrainedTokenizer(object): truncate_first_sequence: if set to `True` and an optional second list of input ids is provided, alongside a specified `max_length`, will truncate the first sequence if the total size is superior than the specified `max_length`. If set to `False`, will truncate the second sequence instead. - always_truncate: if set to True, will always truncate the sequences when overflowing, even if one of the - sequences may be lost in the process. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. @@ -850,14 +839,9 @@ class PreTrainedTokenizer(object): if max_length: n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0 if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length: - if always_truncate: - logger.warning( - "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. 
" - "This pair of sequences will be truncated but one of the sequences may not be present in the resulting list of ids.") - else: - logger.warning( - "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. " - "This pair of sequences will not be truncated.") + logger.warning( + "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length. " + "This pair of sequences will be truncated with no regard to the special tokens") else: if n_added_tokens + len_ids + len_pair_ids > max_length: if truncate_first_sequence or not pair: @@ -890,7 +874,7 @@ class PreTrainedTokenizer(object): encoded_inputs["input_ids"] = sequence encoded_inputs["token_type_ids"] = token_type_ids - if always_truncate and len(encoded_inputs["input_ids"]) > max_length: + if max_length and len(encoded_inputs["input_ids"]) > max_length: encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length] encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length] From aebd83230f4a70df386ea2104c21ec46e9320f7f Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 2 Oct 2019 18:04:38 -0400 Subject: [PATCH 08/11] Update naming + remove f string in run_lm_finetuning example --- examples/run_lm_finetuning.py | 4 ++-- .../tests/tokenization_tests_commons.py | 22 +++++++++---------- transformers/tokenization_bert.py | 2 +- transformers/tokenization_roberta.py | 2 +- transformers/tokenization_utils.py | 9 ++++---- transformers/tokenization_xlm.py | 2 +- transformers/tokenization_xlnet.py | 2 +- 7 files changed, 22 insertions(+), 21 deletions(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 024b254b56..585bbc8d75 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -59,7 +59,7 @@ class TextDataset(Dataset): def __init__(self, tokenizer, file_path='train', block_size=512): assert 
os.path.isfile(file_path) directory, filename = os.path.split(file_path) - cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename)) + cached_features_file = os.path.join(directory, 'cached_lm_' + block_size + '_' + filename) if os.path.exists(cached_features_file): logger.info("Loading features from cached file %s", cached_features_file) @@ -110,7 +110,7 @@ def mask_tokens(inputs, tokenizer, args): # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, args.mlm_probability) probability_matrix *= torch.tensor( - [tokenizer.get_sequence_ids(val, special_tokens_present=True) for val in labels.tolist()], + [tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()], dtype=torch.float ) masked_indices = torch.bernoulli(probability_matrix).bool() diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index f4805598cd..c902347fe5 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -276,7 +276,7 @@ class CommonTestCases: assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input - def test_sequence_ids(self): + def test_special_tokens_mask(self): tokenizer = self.get_tokenizer() sequence_0 = "Encode this." 
@@ -286,10 +286,10 @@ class CommonTestCases: encoded_sequence = tokenizer.encode(sequence_0) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - sequence_ids = encoded_sequence_dict["sequence_ids"] - assert len(sequence_ids) == len(encoded_sequence_w_special) + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + assert len(special_tokens_mask) == len(encoded_sequence_w_special) - filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] filtered_sequence = [x for x in filtered_sequence if x is not None] assert encoded_sequence == filtered_sequence @@ -297,10 +297,10 @@ class CommonTestCases: encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - sequence_ids = encoded_sequence_dict["sequence_ids"] - assert len(sequence_ids) == len(encoded_sequence_w_special) + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + assert len(special_tokens_mask) == len(encoded_sequence_w_special) - filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] filtered_sequence = [x for x in filtered_sequence if x is not None] assert encoded_sequence == filtered_sequence @@ -309,10 +309,10 @@ class CommonTestCases: tokenizer.add_special_tokens({'cls_token': '', 'sep_token': ''}) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - sequence_ids_orig = 
encoded_sequence_dict["sequence_ids"] - sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True) - assert len(sequence_ids) == len(encoded_sequence_w_special) - assert sequence_ids_orig == sequence_ids + special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"] + special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, special_tokens_present=True) + assert len(special_tokens_mask) == len(encoded_sequence_w_special) + assert special_tokens_mask_orig == special_tokens_mask diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index 72720bfa4e..cf6ab51d02 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer): return cls + token_ids_0 + sep + token_ids_1 + sep - def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index 6f19ee699c..ceb1245169 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -100,7 +100,7 @@ class RobertaTokenizer(GPT2Tokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 527ee8422d..bb69df8698 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -820,7 +820,7 @@ class PreTrainedTokenizer(object): { input_ids: list[int], overflowing_tokens: list[int] if a ``max_length`` is specified, else None - sequence_ids: list[int] if ``add_special_tokens`` if set to ``True`` + special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` } With the fields: @@ -828,7 +828,7 @@ class PreTrainedTokenizer(object): ``overflowing_tokens``: list of overflowing tokens if a max length is specified. - ``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added tokens and 1 specifying sequence tokens. """ pair = bool(pair_ids is not None) @@ -857,7 +857,7 @@ class PreTrainedTokenizer(object): if add_special_tokens: sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence) - encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids) + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) @@ -877,6 +877,7 @@ class PreTrainedTokenizer(object): if max_length and len(encoded_inputs["input_ids"]) > max_length: encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length] encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length] + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length] return encoded_inputs @@ -892,7 +893,7 @@ class PreTrainedTokenizer(object): 
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.") return token_ids_0 + token_ids_1 - def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) def convert_ids_to_tokens(self, ids, skip_special_tokens=False): diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index 3f398853dc..817c6d8c44 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep - def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index 325a32ef6f..37f975abef 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return token_ids_0 + sep + token_ids_1 + sep + cls - def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
From 9e136ff57c268bfe0c0bfb322401684aaba12d15 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Fri, 4 Oct 2019 15:00:56 -0400 Subject: [PATCH 09/11] Honor args.overwrite_cache (h/t @erenup) --- examples/run_glue.py | 2 +- examples/run_multiple_choice.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index fc3b617da0..3a6eac63ad 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -270,7 +270,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task))) - if os.path.exists(cached_features_file): + if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 2ce1327c98..223316cbcb 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -293,7 +293,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task))) - if os.path.exists(cached_features_file): + if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: From 6c1d0bc0665ef01710db301fb1a0a3c23778714a Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 4 Oct 2019 17:38:38 -0400 Subject: [PATCH 10/11] update encode_plus - add truncation strategies --- examples/run_lm_finetuning.py | 8 +- transformers/tests/tokenization_bert_test.py | 4 +- .../tests/tokenization_distilbert_test.py | 4 +- .../tests/tokenization_roberta_test.py | 4 +- .../tests/tokenization_tests_commons.py | 51 +++--- transformers/tests/tokenization_xlm_test.py | 4 
+- transformers/tests/tokenization_xlnet_test.py | 4 +- transformers/tokenization_bert.py | 48 +++--- transformers/tokenization_roberta.py | 49 +++--- transformers/tokenization_utils.py | 147 ++++++++++-------- transformers/tokenization_xlm.py | 45 +++--- transformers/tokenization_xlnet.py | 47 +++--- 12 files changed, 222 insertions(+), 193 deletions(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 585bbc8d75..6c5e749868 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -75,7 +75,7 @@ class TextDataset(Dataset): tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size - self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size])) + self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size])) # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) # If your dataset is small, first you should loook for a bigger one :-) and second you # can change this behavior by adding (model specific) padding. 
@@ -109,10 +109,8 @@ def mask_tokens(inputs, tokenizer, args): labels = inputs.clone() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, args.mlm_probability) - probability_matrix *= torch.tensor( - [tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()], - dtype=torch.float - ) + special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()] + probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -1 # We only compute loss on masked tokens diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py index b70941f884..5e49e2915b 100644 --- a/transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) - encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) assert encoded_sentence == [101] + text + [102] assert encoded_pair == [101] + text + [102] + text_2 + [102] diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py index 64a88df99f..a18d644fe8 100644 --- a/transformers/tests/tokenization_distilbert_test.py +++ b/transformers/tests/tokenization_distilbert_test.py @@ -36,8 +36,8 @@ class DistilBertTokenizationTest(BertTokenizationTest): text = 
tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) - encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \ diff --git a/transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py index f14b26a2e4..a731ac26c9 100644 --- a/transformers/tests/tokenization_roberta_test.py +++ b/transformers/tests/tokenization_roberta_test.py @@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) - encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) - encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) assert encoded_sentence == encoded_text_from_decode assert encoded_pair == encoded_pair_from_decode diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index c902347fe5..b8f9295633 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -193,12 +193,12 @@ class CommonTestCases: tokenizer = self.get_tokenizer() - if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer": + if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] 
!= "PreTrainedTokenizer": seq_0 = "Test this method." seq_1 = "With these inputs." information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True) sequences, mask = information["input_ids"], information["token_type_ids"] - assert len(sequences) == len(mask) + self.assertEqual(len(sequences), len(mask)) def test_number_of_added_tokens(self): tokenizer = self.get_tokenizer() @@ -211,7 +211,7 @@ class CommonTestCases: # Method is implemented (e.g. not GPT-2) if len(attached_sequences) != 2: - assert tokenizer.num_added_tokens(pair=True) == len(attached_sequences) - len(sequences) + self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences)) def test_maximum_encoding_length_single_input(self): tokenizer = self.get_tokenizer() @@ -227,10 +227,10 @@ class CommonTestCases: truncated_sequence = information["input_ids"] overflowing_tokens = information["overflowing_tokens"] - assert len(overflowing_tokens) == 2 + stride - assert overflowing_tokens == sequence[-(2 + stride):] - assert len(truncated_sequence) == total_length - 2 - assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2]) + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, sequence[-(2 + stride):]) + self.assertEqual(len(truncated_sequence), total_length - 2) + self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2])) def test_maximum_encoding_length_pair_input(self): tokenizer = self.get_tokenizer() @@ -243,7 +243,7 @@ class CommonTestCases: sequence_1_no_special_tokens = tokenizer.encode(seq_1) sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) - truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair( + truncated_second_sequence = tokenizer.build_inputs_with_special_tokens( tokenizer.encode(seq_0), tokenizer.encode(seq_1)[:-2] ) @@ -258,11 +258,11 @@ class CommonTestCases: overflowing_tokens = 
information["overflowing_tokens"] overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"] - assert len(overflowing_tokens) == 2 + stride - assert overflowing_tokens == sequence_1_no_special_tokens[-(2 + stride):] - assert overflowing_tokens_first_truncated == sequence_0_no_special_tokens[-(2 + stride):] - assert len(truncated_sequence) == len(sequence) - 2 - assert truncated_sequence == truncated_second_sequence + self.assertEqual(len(overflowing_tokens), 2 + stride) + self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):]) + self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):]) + self.assertEqual(len(truncated_sequence), len(sequence) - 2) + self.assertEqual(truncated_sequence, truncated_second_sequence) def test_encode_input_type(self): tokenizer = self.get_tokenizer() @@ -273,8 +273,8 @@ class CommonTestCases: input_ids = tokenizer.convert_tokens_to_ids(tokens) formatted_input = tokenizer.encode(sequence, add_special_tokens=True) - assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input - assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input + self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input) + self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input) def test_special_tokens_mask(self): tokenizer = self.get_tokenizer() @@ -287,22 +287,22 @@ class CommonTestCases: encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - assert len(special_tokens_mask) == len(encoded_sequence_w_special) + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = 
[(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] filtered_sequence = [x for x in filtered_sequence if x is not None] - assert encoded_sequence == filtered_sequence + self.assertEqual(encoded_sequence, filtered_sequence) # Testing inputs pairs encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] - assert len(special_tokens_mask) == len(encoded_sequence_w_special) + self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] filtered_sequence = [x for x in filtered_sequence if x is not None] - assert encoded_sequence == filtered_sequence + self.assertEqual(encoded_sequence, filtered_sequence) # Testing with already existing special tokens if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id: @@ -310,9 +310,6 @@ class CommonTestCases: encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"] - special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, special_tokens_present=True) - assert len(special_tokens_mask) == len(encoded_sequence_w_special) - assert special_tokens_mask_orig == special_tokens_mask - - - + special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True) + self.assertEqual(len(special_tokens_mask), 
len(encoded_sequence_w_special)) + self.assertEqual(special_tokens_mask_orig, special_tokens_mask) diff --git a/transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py index b1a71ede59..0949b0cce4 100644 --- a/transformers/tests/tokenization_xlm_test.py +++ b/transformers/tests/tokenization_xlm_test.py @@ -72,8 +72,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) - encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) assert encoded_sentence == [1] + text + [1] assert encoded_pair == [1] + text + [1] + text_2 + [1] diff --git a/transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py index f4418c7fe5..1a5dbcf6df 100644 --- a/transformers/tests/tokenization_xlnet_test.py +++ b/transformers/tests/tokenization_xlnet_test.py @@ -95,8 +95,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): text = tokenizer.encode("sequence builders") text_2 = tokenizer.encode("multi-sequence build") - encoded_sentence = tokenizer.add_special_tokens_single_sequence(text) - encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2) + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) assert encoded_sentence == text + [4, 3] assert encoded_pair == text + [4] + text_2 + [4, 3] diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index cf6ab51d02..d256f27a58 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -187,24 +187,21 @@ class BertTokenizer(PreTrainedTokenizer): out_string = ' 
'.join(tokens).replace(' ##', '').strip() return out_string - def add_special_tokens_single_sequence(self, token_ids): + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ - Adds special tokens to the a sequence for sequence classification tasks. - A BERT sequence has the following format: [CLS] X [SEP] + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A BERT sequence has the following format: + single sequence: [CLS] X [SEP] + pair of sequences: [CLS] A [SEP] B [SEP] """ - return [self.cls_token_id] + token_ids + [self.sep_token_id] - - def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): - """ - Adds special tokens to a sequence pair for sequence classification tasks. - A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP] - """ - sep = [self.sep_token_id] + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] - + sep = [self.sep_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
@@ -212,30 +209,37 @@ class BertTokenizer(PreTrainedTokenizer): Args: token_ids_0: list of ids (must not contain special tokens) token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model Returns: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ - if special_tokens_present: - return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0)) + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError("You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model.") + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - if token_ids_1: - return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0] - else: - return [0] + ([1] * len(token_ids_0)) + [0] + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence pair mask has the following format: 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 | first sequence | second sequence + + if token_ids_1 is None, only returns the first portion of the mask (0's). 
""" sep = [self.sep_token_id] cls = [self.cls_token_id] - + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, vocab_path): diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index ceb1245169..9cc8a9af6e 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -84,23 +84,21 @@ class RobertaTokenizer(GPT2Tokenizer): self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens - def add_special_tokens_single_sequence(self, token_ids): + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ - Adds special tokens to a sequence for sequence classification tasks. - A RoBERTa sequence has the following format: X + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A RoBERTa sequence has the following format: + single sequence: X + pair of sequences: A B """ - return [self.cls_token_id] + token_ids + [self.sep_token_id] - - def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): - """ - Adds special tokens to a sequence pair for sequence classification tasks. - A RoBERTa sequence pair has the following format: A B - """ - sep = [self.sep_token_id] + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] + sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """ Retrieves sequence ids from a token list that has no special tokens added. 
This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. @@ -108,28 +106,35 @@ class RobertaTokenizer(GPT2Tokenizer): Args: token_ids_0: list of ids (must not contain special tokens) token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model Returns: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError("You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model.") + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - if special_tokens_present: - return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0)) + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - if token_ids_1: - return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0] - else: - return [0] + ([1] * len(token_ids_0)) + [0] - - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A RoBERTa sequence pair mask has the following format: 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 | first sequence | second sequence + + if token_ids_1 is None, only returns the first portion of the mask (0's). 
""" sep = [self.sep_token_id] cls = [self.cls_token_id] - return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] \ No newline at end of file + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 2f25e95f4b..ce5811b96f 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -538,15 +538,9 @@ class PreTrainedTokenizer(object): Returns: Number of tokens added to sequences """ - - if pair: - initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another")) - final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True)) - else: - initial_tokens_len = len(self.encode("This is a sequence")) - final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True)) - - return final_tokens_len - initial_tokens_len + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) def add_special_tokens(self, special_tokens_dict): """ @@ -795,7 +789,7 @@ class PreTrainedTokenizer(object): return_tensors=return_tensors) def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, - truncate_first_sequence=True, truncate_both_sequences=True, return_tensors=None): + truncation_strategy='longest_first', return_tensors=None): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates @@ -812,6 +806,12 @@ class PreTrainedTokenizer(object): to their model. stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential list of inputs. 
+ truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) truncate_first_sequence: if set to `True` and an optional second list of input ids is provided, alongside a specified `max_length`, will truncate the first sequence if the total size is superior than the specified `max_length`. If set to `False`, will truncate the second sequence instead. @@ -840,37 +840,17 @@ class PreTrainedTokenizer(object): len_pair_ids = len(pair_ids) if pair else 0 encoded_inputs = {} - if max_length: - n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0 - - if n_added_tokens + len_ids + len_pair_ids > max_length: - if truncate_both_sequences: - tokens_a, tokens_b = self._truncate_seq_pair( - copy.deepcopy(ids), - copy.deepcopy(pair_ids), - max_length=max_length - n_added_tokens - ) - truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):] - encoded_inputs["num_truncated_tokens"] = len(truncated_tokens) - ids = tokens_a - pair_ids = tokens_b - elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length: - logger.warning( - "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length." 
- "This pair of sequences will not be truncated.") - elif truncate_first_sequence or not pair: - encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:] - ids = ids[:max_length - len_pair_ids - n_added_tokens] - elif not truncate_first_sequence and pair: - encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:] - pair_ids = pair_ids[:max_length - len_ids - n_added_tokens] - else: - logger.warning( - "Cannot truncate second sequence as it is not provided. No truncation.") + total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0) + if max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids, + num_tokens_to_remove=total_len-max_length, + truncation_strategy=truncation_strategy) + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length if add_special_tokens: - sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids) - token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence) + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) else: sequence = ids + pair_ids if pair else ids @@ -895,39 +875,76 @@ class PreTrainedTokenizer(object): return encoded_inputs - def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" + def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first'): + """Truncates a sequence pair in place to the maximum length. 
+ truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. + if truncation_strategy == 'longest_first': + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens.append(ids[-1]) + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + elif truncation_strategy == 'only_first': + assert len(ids) > num_tokens_to_remove + overflowing_tokens = ids[-num_tokens_to_remove:] + ids = ids[:-num_tokens_to_remove] + elif truncation_strategy == 'only_second': + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + overflowing_tokens = pair_ids[-num_tokens_to_remove:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif truncation_strategy == 'do_not_truncate': + raise ValueError("Input sequence are too long for max_length. 
Please select a truncation strategy.") + else: + raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']") + return (ids, pair_ids, overflowing_tokens) - # However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger - # length or only pop from context - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - return (tokens_a, tokens_b) - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): logger.warning("This tokenizer does not make use of special tokens.") + if token_ids_1 is None: + return len(token_ids_0) * [0] return [0] * len(token_ids_0) + [1] * len(token_ids_1) - def add_special_tokens_single_sequence(self, token_ids): - logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.") - return token_ids - - def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): - logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.") + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A RoBERTa sequence has the following format: + single sequence: X + pair of sequences: A B + """ + logger.warning("This tokenizer does not make use of special tokens. 
Input is returned with no modification.") + if token_ids_1 is None: + return token_ids_0 return token_ids_0 + token_ids_1 - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): - return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model + + Returns: + A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) def convert_ids_to_tokens(self, ids, skip_special_tokens=False): """ Converts a single index or a sequence of indices (integers) in a token " diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index 817c6d8c44..d09ce6b9dc 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -754,23 +754,21 @@ class XLMTokenizer(PreTrainedTokenizer): out_string = ''.join(tokens).replace('', ' ').strip() return out_string - def add_special_tokens_single_sequence(self, token_ids): + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ - Adds special tokens to a sequence for sequence classification tasks. 
- An XLM sequence has the following format: [CLS] X [SEP] - """ - return [self.cls_token_id] + token_ids + [self.sep_token_id] - - def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): - """ - Adds special tokens to a sequence pair for sequence classification tasks. - An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP] + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A RoBERTa sequence has the following format: + single sequence: X + pair of sequences: A B """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] sep = [self.sep_token_id] cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. @@ -778,30 +776,37 @@ class XLMTokenizer(PreTrainedTokenizer): Args: token_ids_0: list of ids (must not contain special tokens) token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model Returns: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. 
""" - if special_tokens_present: - return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0)) + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError("You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model.") + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - if token_ids_1: - return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0] - else: - return [0] + ([1] * len(token_ids_0)) + [0] + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence pair mask has the following format: 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 | first sequence | second sequence + + if token_ids_1 is None, only returns the first portion of the mask (0's). 
""" sep = [self.sep_token_id] cls = [self.cls_token_id] - + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] def save_vocabulary(self, save_directory): diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index 37f975abef..deae8de336 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -181,26 +181,21 @@ class XLNetTokenizer(PreTrainedTokenizer): out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() return out_string - def add_special_tokens_single_sequence(self, token_ids): + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ - Adds special tokens to a sequence for sequence classification tasks. - An XLNet sequence has the following format: X [SEP][CLS] + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A RoBERTa sequence has the following format: + single sequence: X + pair of sequences: A B """ sep = [self.sep_token_id] cls = [self.cls_token_id] - return token_ids + sep + cls - - def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1): - """ - Adds special tokens to a sequence pair for sequence classification tasks. - An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS] - """ - - sep = [self.sep_token_id] - cls = [self.cls_token_id] + if token_ids_1 is None: + return token_ids_0 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
@@ -208,31 +203,39 @@ class XLNetTokenizer(PreTrainedTokenizer): Args: token_ids_0: list of ids (must not contain special tokens) token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formatted with + special tokens for the model Returns: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. """ - if special_tokens_present: - return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0)) + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError("You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model.") + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - if token_ids_1: - return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0] - else: - return ([1] * len(token_ids_0)) + [0, 0] + if token_ids_1 is not None: + return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] + return ([0] * len(token_ids_0)) + [1, 1] - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence pair mask has the following format: 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2 | first sequence | second sequence | CLS segment ID + + if token_ids_1 is None, only returns the first portion of the mask (0's). 
""" sep = [self.sep_token_id] cls = [self.cls_token_id] cls_segment_id = [2] + if token_ids_1 is None: + return len(token_ids_0 + sep + cls) * [0] return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id def save_vocabulary(self, save_directory): From 78ef1a99306213599d4eab8ae48f17a81d1ee2b8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 4 Oct 2019 17:59:44 -0400 Subject: [PATCH 11/11] fixes --- examples/utils_multiple_choice.py | 1 - transformers/data/processors/glue.py | 1 - .../tests/tokenization_tests_commons.py | 4 +- transformers/tokenization_utils.py | 47 +++++++++++-------- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py index a7fc1b1222..a131a63924 100644 --- a/examples/utils_multiple_choice.py +++ b/examples/utils_multiple_choice.py @@ -336,7 +336,6 @@ def convert_examples_to_features( text_b, add_special_tokens=True, max_length=max_length, - truncate_both_sequences=True ) if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0: logger.info('Attention! you are cropping tokens (swag task is ok). 
' diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py index 61bca8c11b..741569ea30 100644 --- a/transformers/data/processors/glue.py +++ b/transformers/data/processors/glue.py @@ -86,7 +86,6 @@ def glue_convert_examples_to_features(examples, tokenizer, example.text_b, add_special_tokens=True, max_length=max_length, - truncate_first_sequence=True # We're truncating the first sequence in priority ) input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index b8f9295633..b2801d5f41 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -249,10 +249,10 @@ class CommonTestCases: ) information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True, - stride=stride, truncate_first_sequence=False) + stride=stride, truncation_strategy='only_second') information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True, stride=stride, - truncate_first_sequence=True) + truncation_strategy='only_first') truncated_sequence = information["input_ids"] overflowing_tokens = information["overflowing_tokens"] diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index ce5811b96f..c7b55f3a9c 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -692,8 +692,7 @@ class PreTrainedTokenizer(object): add_special_tokens=False, max_length=None, stride=0, - truncate_first_sequence=True, - truncate_both_sequences=False, + truncation_strategy='longest_first', return_tensors=None, **kwargs): """ @@ -714,8 +713,12 @@ class PreTrainedTokenizer(object): If there are overflowing tokens, those will be added to the returned dictionary stride: if set to a number along with max_length, the overflowing tokens returned 
will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. - truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence - will be truncated. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method @@ -725,8 +728,7 @@ class PreTrainedTokenizer(object): max_length=max_length, add_special_tokens=add_special_tokens, stride=stride, - truncate_first_sequence=truncate_first_sequence, - truncate_both_sequences=truncate_both_sequences, + truncation_strategy=truncation_strategy, return_tensors=return_tensors, **kwargs) @@ -738,8 +740,7 @@ class PreTrainedTokenizer(object): add_special_tokens=False, max_length=None, stride=0, - truncate_first_sequence=True, - truncate_both_sequences=False, + truncation_strategy='longest_first', return_tensors=None, **kwargs): """ @@ -759,8 +760,12 @@ class PreTrainedTokenizer(object): If there are overflowing tokens, those will be added to the returned dictionary stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. - truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence - will be truncated. 
+ truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method @@ -784,8 +789,7 @@ class PreTrainedTokenizer(object): max_length=max_length, add_special_tokens=add_special_tokens, stride=stride, - truncate_first_sequence=truncate_first_sequence, - truncate_both_sequences=truncate_both_sequences, + truncation_strategy=truncation_strategy, return_tensors=return_tensors) def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, @@ -812,9 +816,6 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - truncate_first_sequence: if set to `True` and an optional second list of input ids is provided, - alongside a specified `max_length`, will truncate the first sequence if the total size is superior - than the specified `max_length`. If set to `False`, will truncate the second sequence instead. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. 
@@ -844,7 +845,8 @@ class PreTrainedTokenizer(object): if max_length and total_len > max_length: ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids, num_tokens_to_remove=total_len-max_length, - truncation_strategy=truncation_strategy) + truncation_strategy=truncation_strategy, + stride=stride) encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length @@ -875,7 +877,7 @@ class PreTrainedTokenizer(object): return encoded_inputs - def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first'): + def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): """Truncates a sequence pair in place to the maximum length. truncation_strategy: string selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length @@ -892,17 +894,22 @@ class PreTrainedTokenizer(object): overflowing_tokens = [] for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): - overflowing_tokens.append(ids[-1]) + overflowing_tokens = [ids[-1]] + overflowing_tokens ids = ids[:-1] else: pair_ids = pair_ids[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens elif truncation_strategy == 'only_first': assert len(ids) > num_tokens_to_remove - overflowing_tokens = ids[-num_tokens_to_remove:] + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] elif truncation_strategy == 'only_second': assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove - overflowing_tokens = pair_ids[-num_tokens_to_remove:] + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] pair_ids = 
pair_ids[:-num_tokens_to_remove] elif truncation_strategy == 'do_not_truncate': raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")