Sequence IDS

This commit is contained in:
LysandreJik
2019-09-30 11:48:18 -04:00
parent 7c789c337d
commit 2f259b228e
6 changed files with 121 additions and 1 deletions

View File

@@ -292,3 +292,33 @@ class CommonTestCases:
assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
def test_sequence_ids(self):
tokenizer = self.get_tokenizer()
sequence_0 = "Encode this."
sequence_1 = "This one too please."
# Testing single inputs
encoded_sequence = tokenizer.encode(sequence_0)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
sequence_ids = encoded_sequence_dict["sequence_ids"]
assert len(sequence_ids) == len(encoded_sequence_w_special)
filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
filtered_sequence = [x for x in filtered_sequence if x is not None]
assert encoded_sequence == filtered_sequence
# Testing inputs pairs
encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
sequence_ids = encoded_sequence_dict["sequence_ids"]
assert len(sequence_ids) == len(encoded_sequence_w_special)
filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
filtered_sequence = [x for x in filtered_sequence if x is not None]
assert encoded_sequence == filtered_sequence

View File

@@ -204,6 +204,24 @@ class BertTokenizer(PreTrainedTokenizer):
return cls + token_ids_0 + sep + token_ids_1 + sep return cls + token_ids_0 + sep + token_ids_1 + sep
def get_sequence_ids(self, token_ids_0, token_ids_1=None):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
"""
if token_ids_1:
return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
else:
return [0] + ([1] * len(token_ids_0)) + [0]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.

View File

@@ -100,6 +100,24 @@ class RobertaTokenizer(GPT2Tokenizer):
cls = [self.cls_token_id] cls = [self.cls_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_sequence_ids(self, token_ids_0, token_ids_1=None):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
"""
if token_ids_1:
return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0]
else:
return [0] + ([1] * len(token_ids_0)) + [0]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.

View File

@@ -826,7 +826,21 @@ class PreTrainedTokenizer(object):
or PyTorch torch.Tensor instead of a list of python integers. or PyTorch torch.Tensor instead of a list of python integers.
Return: Return:
a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given. A Dictionary of shape::
{
input_ids: list[int],
overflowing_tokens: list[int] if a ``max_length`` is specified, else None
sequence_ids: list[int] if ``add_special_tokens`` if set to ``True``
}
With the fields:
``input_ids``: list of tokens to be fed to a model
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
""" """
pair = bool(pair_ids is not None) pair = bool(pair_ids is not None)
len_ids = len(ids) len_ids = len(ids)
@@ -859,6 +873,7 @@ class PreTrainedTokenizer(object):
if add_special_tokens: if add_special_tokens:
sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids) sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids)
else: else:
sequence = ids + pair_ids if pair else ids sequence = ids + pair_ids if pair else ids
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
@@ -893,6 +908,9 @@ class PreTrainedTokenizer(object):
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.") logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
return token_ids_0 + token_ids_1 return token_ids_0 + token_ids_1
def get_sequence_ids(self, token_ids_0, token_ids_1=None):
return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
def convert_ids_to_tokens(self, ids, skip_special_tokens=False): def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
""" Converts a single index or a sequence of indices (integers) in a token " """ Converts a single index or a sequence of indices (integers) in a token "
(resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens. (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.

View File

@@ -770,6 +770,24 @@ class XLMTokenizer(PreTrainedTokenizer):
cls = [self.cls_token_id] cls = [self.cls_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep return cls + token_ids_0 + sep + token_ids_1 + sep
def get_sequence_ids(self, token_ids_0, token_ids_1=None):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
"""
if token_ids_1:
return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
else:
return [0] + ([1] * len(token_ids_0)) + [0]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.

View File

@@ -200,6 +200,24 @@ class XLNetTokenizer(PreTrainedTokenizer):
cls = [self.cls_token_id] cls = [self.cls_token_id]
return token_ids_0 + sep + token_ids_1 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls
def get_sequence_ids(self, token_ids_0, token_ids_1=None):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
"""
if token_ids_1:
return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0]
else:
return ([1] * len(token_ids_0)) + [0, 0]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1): def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.