From 2f259b228e9901e1bd46b9cb0692a40a9a7e98e7 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 30 Sep 2019 11:48:18 -0400 Subject: [PATCH] Sequence IDS --- .../tests/tokenization_tests_commons.py | 30 +++++++++++++++++++ transformers/tokenization_bert.py | 18 +++++++++++ transformers/tokenization_roberta.py | 18 +++++++++++ transformers/tokenization_utils.py | 20 ++++++++++++- transformers/tokenization_xlm.py | 18 +++++++++++ transformers/tokenization_xlnet.py | 18 +++++++++++ 6 files changed, 121 insertions(+), 1 deletion(-) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index 73a2cb44b3..f44be1f97b 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -292,3 +292,33 @@ class CommonTestCases: assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input + + def test_sequence_ids(self): + tokenizer = self.get_tokenizer() + + sequence_0 = "Encode this." + sequence_1 = "This one too please." 
+ + # Testing single inputs + encoded_sequence = tokenizer.encode(sequence_0) + encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + sequence_ids = encoded_sequence_dict["sequence_ids"] + assert len(sequence_ids) == len(encoded_sequence_w_special) + + filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [x for x in filtered_sequence if x is not None] + assert encoded_sequence == filtered_sequence + + # Testing input pairs + encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1) + encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True) + encoded_sequence_w_special = encoded_sequence_dict["input_ids"] + sequence_ids = encoded_sequence_dict["sequence_ids"] + assert len(sequence_ids) == len(encoded_sequence_w_special) + + filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [x for x in filtered_sequence if x is not None] + assert encoded_sequence == filtered_sequence + + diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index 42163cb8ec..b05f88b7a8 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -204,6 +204,24 @@ class BertTokenizer(PreTrainedTokenizer): return cls + token_ids_0 + sep + token_ids_1 + sep + def get_sequence_ids(self, token_ids_0, token_ids_1=None): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
+ + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + + Returns: + A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + if token_ids_1: + return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0] + else: + return [0] + ([1] * len(token_ids_0)) + [0] + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index 7adeea689e..c371ae3904 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -100,6 +100,24 @@ class RobertaTokenizer(GPT2Tokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep + def get_sequence_ids(self, token_ids_0, token_ids_1=None): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + + Returns: + A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + if token_ids_1: + return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0] + else: + return [0] + ([1] * len(token_ids_0)) + [0] + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index fc7fe1df67..5aa7f55730 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -826,7 +826,21 @@ class PreTrainedTokenizer(object): or PyTorch torch.Tensor instead of a list of python integers. Return: - a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given. + A Dictionary of shape:: + + { + input_ids: list[int], + overflowing_tokens: list[int] if a ``max_length`` is specified, else None + sequence_ids: list[int] if ``add_special_tokens`` is set to ``True`` + } + + With the fields: + ``input_ids``: list of tokens to be fed to a model + + ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + + ``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. """ pair = bool(pair_ids is not None) len_ids = len(ids) @@ -859,6 +873,7 @@ class PreTrainedTokenizer(object): if add_special_tokens: sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence) + encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) @@ -893,6 +908,9 @@ class PreTrainedTokenizer(object): logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.") return token_ids_0 + token_ids_1 + def get_sequence_ids(self, token_ids_0, token_ids_1=None): + return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): """ Converts a single index or a sequence of indices (integers) in a token " (resp.) 
a sequence of tokens (str/unicode), using the vocabulary and added tokens. diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index f1e49416a4..b6e27ee59e 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -770,6 +770,24 @@ class XLMTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep + def get_sequence_ids(self, token_ids_0, token_ids_1=None): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + + Returns: + A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + if token_ids_1: + return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0] + else: + return [0] + ([1] * len(token_ids_0)) + [0] + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index ad9efdf043..c8d59decee 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -200,6 +200,24 @@ class XLNetTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return token_ids_0 + sep + token_ids_1 + sep + cls + def get_sequence_ids(self, token_ids_0, token_ids_1=None): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
+ + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + + Returns: + A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + if token_ids_1: + return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0] + else: + return ([1] * len(token_ids_0)) + [0, 0] + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task.