update encode_plus - add truncation strategies
This commit is contained in:
@@ -75,7 +75,7 @@ class TextDataset(Dataset):
|
|||||||
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
||||||
|
|
||||||
for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
|
for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
|
||||||
self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size]))
|
self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
|
||||||
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
||||||
# If your dataset is small, first you should look for a bigger one :-) and second you
|
# If your dataset is small, first you should look for a bigger one :-) and second you
|
||||||
# can change this behavior by adding (model specific) padding.
|
# can change this behavior by adding (model specific) padding.
|
||||||
@@ -109,10 +109,8 @@ def mask_tokens(inputs, tokenizer, args):
|
|||||||
labels = inputs.clone()
|
labels = inputs.clone()
|
||||||
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
|
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
|
||||||
probability_matrix = torch.full(labels.shape, args.mlm_probability)
|
probability_matrix = torch.full(labels.shape, args.mlm_probability)
|
||||||
probability_matrix *= torch.tensor(
|
special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
|
||||||
[tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()],
|
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
|
||||||
dtype=torch.float
|
|
||||||
)
|
|
||||||
masked_indices = torch.bernoulli(probability_matrix).bool()
|
masked_indices = torch.bernoulli(probability_matrix).bool()
|
||||||
labels[~masked_indices] = -1 # We only compute loss on masked tokens
|
labels[~masked_indices] = -1 # We only compute loss on masked tokens
|
||||||
|
|
||||||
|
|||||||
@@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders")
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build")
|
||||||
|
|
||||||
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|
||||||
assert encoded_sentence == [101] + text + [102]
|
assert encoded_sentence == [101] + text + [102]
|
||||||
assert encoded_pair == [101] + text + [102] + text_2 + [102]
|
assert encoded_pair == [101] + text + [102] + text_2 + [102]
|
||||||
|
|||||||
@@ -36,8 +36,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
|
|||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders")
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build")
|
||||||
|
|
||||||
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|
||||||
assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
|
assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
|
||||||
assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
|
assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
|
||||||
|
|||||||
@@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
|
encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
|
||||||
encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
|
encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
|
||||||
|
|
||||||
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|
||||||
assert encoded_sentence == encoded_text_from_decode
|
assert encoded_sentence == encoded_text_from_decode
|
||||||
assert encoded_pair == encoded_pair_from_decode
|
assert encoded_pair == encoded_pair_from_decode
|
||||||
|
|||||||
@@ -193,12 +193,12 @@ class CommonTestCases:
|
|||||||
|
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
|
if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] != "PreTrainedTokenizer":
|
||||||
seq_0 = "Test this method."
|
seq_0 = "Test this method."
|
||||||
seq_1 = "With these inputs."
|
seq_1 = "With these inputs."
|
||||||
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
|
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
|
||||||
sequences, mask = information["input_ids"], information["token_type_ids"]
|
sequences, mask = information["input_ids"], information["token_type_ids"]
|
||||||
assert len(sequences) == len(mask)
|
self.assertEqual(len(sequences), len(mask))
|
||||||
|
|
||||||
def test_number_of_added_tokens(self):
|
def test_number_of_added_tokens(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
@@ -211,7 +211,7 @@ class CommonTestCases:
|
|||||||
|
|
||||||
# Method is implemented (e.g. not GPT-2)
|
# Method is implemented (e.g. not GPT-2)
|
||||||
if len(attached_sequences) != 2:
|
if len(attached_sequences) != 2:
|
||||||
assert tokenizer.num_added_tokens(pair=True) == len(attached_sequences) - len(sequences)
|
self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))
|
||||||
|
|
||||||
def test_maximum_encoding_length_single_input(self):
|
def test_maximum_encoding_length_single_input(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
@@ -227,10 +227,10 @@ class CommonTestCases:
|
|||||||
truncated_sequence = information["input_ids"]
|
truncated_sequence = information["input_ids"]
|
||||||
overflowing_tokens = information["overflowing_tokens"]
|
overflowing_tokens = information["overflowing_tokens"]
|
||||||
|
|
||||||
assert len(overflowing_tokens) == 2 + stride
|
self.assertEqual(len(overflowing_tokens), 2 + stride)
|
||||||
assert overflowing_tokens == sequence[-(2 + stride):]
|
self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
|
||||||
assert len(truncated_sequence) == total_length - 2
|
self.assertEqual(len(truncated_sequence), total_length - 2)
|
||||||
assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2])
|
self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))
|
||||||
|
|
||||||
def test_maximum_encoding_length_pair_input(self):
|
def test_maximum_encoding_length_pair_input(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
@@ -243,7 +243,7 @@ class CommonTestCases:
|
|||||||
sequence_1_no_special_tokens = tokenizer.encode(seq_1)
|
sequence_1_no_special_tokens = tokenizer.encode(seq_1)
|
||||||
|
|
||||||
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
|
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
|
||||||
truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair(
|
truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
|
||||||
tokenizer.encode(seq_0),
|
tokenizer.encode(seq_0),
|
||||||
tokenizer.encode(seq_1)[:-2]
|
tokenizer.encode(seq_1)[:-2]
|
||||||
)
|
)
|
||||||
@@ -258,11 +258,11 @@ class CommonTestCases:
|
|||||||
overflowing_tokens = information["overflowing_tokens"]
|
overflowing_tokens = information["overflowing_tokens"]
|
||||||
overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
|
overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
|
||||||
|
|
||||||
assert len(overflowing_tokens) == 2 + stride
|
self.assertEqual(len(overflowing_tokens), 2 + stride)
|
||||||
assert overflowing_tokens == sequence_1_no_special_tokens[-(2 + stride):]
|
self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):])
|
||||||
assert overflowing_tokens_first_truncated == sequence_0_no_special_tokens[-(2 + stride):]
|
self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):])
|
||||||
assert len(truncated_sequence) == len(sequence) - 2
|
self.assertEqual(len(truncated_sequence), len(sequence) - 2)
|
||||||
assert truncated_sequence == truncated_second_sequence
|
self.assertEqual(truncated_sequence, truncated_second_sequence)
|
||||||
|
|
||||||
def test_encode_input_type(self):
|
def test_encode_input_type(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
@@ -273,8 +273,8 @@ class CommonTestCases:
|
|||||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||||
formatted_input = tokenizer.encode(sequence, add_special_tokens=True)
|
formatted_input = tokenizer.encode(sequence, add_special_tokens=True)
|
||||||
|
|
||||||
assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
|
self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
|
||||||
assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
|
self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
|
||||||
|
|
||||||
def test_special_tokens_mask(self):
|
def test_special_tokens_mask(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
@@ -287,22 +287,22 @@ class CommonTestCases:
|
|||||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
||||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||||
assert len(special_tokens_mask) == len(encoded_sequence_w_special)
|
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
|
||||||
|
|
||||||
filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
|
filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
|
||||||
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
||||||
assert encoded_sequence == filtered_sequence
|
self.assertEqual(encoded_sequence, filtered_sequence)
|
||||||
|
|
||||||
# Testing inputs pairs
|
# Testing inputs pairs
|
||||||
encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
|
encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
|
||||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
|
||||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||||
assert len(special_tokens_mask) == len(encoded_sequence_w_special)
|
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
|
||||||
|
|
||||||
filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
|
filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
|
||||||
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
||||||
assert encoded_sequence == filtered_sequence
|
self.assertEqual(encoded_sequence, filtered_sequence)
|
||||||
|
|
||||||
# Testing with already existing special tokens
|
# Testing with already existing special tokens
|
||||||
if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
|
if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
|
||||||
@@ -310,9 +310,6 @@ class CommonTestCases:
|
|||||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
||||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
|
special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
|
||||||
special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, special_tokens_present=True)
|
special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
|
||||||
assert len(special_tokens_mask) == len(encoded_sequence_w_special)
|
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
|
||||||
assert special_tokens_mask_orig == special_tokens_mask
|
self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -72,8 +72,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders")
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build")
|
||||||
|
|
||||||
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|
||||||
assert encoded_sentence == [1] + text + [1]
|
assert encoded_sentence == [1] + text + [1]
|
||||||
assert encoded_pair == [1] + text + [1] + text_2 + [1]
|
assert encoded_pair == [1] + text + [1] + text_2 + [1]
|
||||||
|
|||||||
@@ -95,8 +95,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders")
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build")
|
||||||
|
|
||||||
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|
||||||
assert encoded_sentence == text + [4, 3]
|
assert encoded_sentence == text + [4, 3]
|
||||||
assert encoded_pair == text + [4] + text_2 + [4, 3]
|
assert encoded_pair == text + [4] + text_2 + [4, 3]
|
||||||
|
|||||||
@@ -187,24 +187,21 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = ' '.join(tokens).replace(' ##', '').strip()
|
out_string = ' '.join(tokens).replace(' ##', '').strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to the a sequence for sequence classification tasks.
|
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
|
||||||
A BERT sequence has the following format: [CLS] X [SEP]
|
by concatenating and adding special tokens.
|
||||||
|
A BERT sequence has the following format:
|
||||||
|
single sequence: [CLS] X [SEP]
|
||||||
|
pair of sequences: [CLS] A [SEP] B [SEP]
|
||||||
"""
|
"""
|
||||||
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
if token_ids_1 is None:
|
||||||
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
|
||||||
"""
|
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
|
||||||
A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
|
|
||||||
"""
|
|
||||||
sep = [self.sep_token_id]
|
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
sep = [self.sep_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
@@ -213,29 +210,36 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0: list of ids (must not contain special tokens)
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||||
for sequence pairs
|
for sequence pairs
|
||||||
|
already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
||||||
|
special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if special_tokens_present:
|
if already_has_special_tokens:
|
||||||
return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
|
if token_ids_1 is not None:
|
||||||
|
raise ValueError("You should not supply a second sequence if the provided sequence of "
|
||||||
|
"ids is already formated with special tokens for the model.")
|
||||||
|
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||||
|
|
||||||
if token_ids_1:
|
if token_ids_1 is not None:
|
||||||
return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
|
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||||
else:
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
return [0] + ([1] * len(token_ids_0)) + [0]
|
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A BERT sequence pair mask has the following format:
|
A BERT sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
| first sequence | second sequence
|
| first sequence | second sequence
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return len(cls + token_ids_0 + sep) * [0]
|
||||||
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, vocab_path):
|
def save_vocabulary(self, vocab_path):
|
||||||
|
|||||||
@@ -84,23 +84,21 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||||
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
|
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
|
||||||
A RoBERTa sequence has the following format: <s> X </s>
|
by concatenating and adding special tokens.
|
||||||
|
A RoBERTa sequence has the following format:
|
||||||
|
single sequence: <s> X </s>
|
||||||
|
pair of sequences: <s> A </s></s> B </s>
|
||||||
"""
|
"""
|
||||||
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
if token_ids_1 is None:
|
||||||
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
|
||||||
"""
|
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
|
||||||
A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
|
|
||||||
"""
|
|
||||||
sep = [self.sep_token_id]
|
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
sep = [self.sep_token_id]
|
||||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
@@ -109,27 +107,34 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0: list of ids (must not contain special tokens)
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||||
for sequence pairs
|
for sequence pairs
|
||||||
|
already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
||||||
|
special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
|
if already_has_special_tokens:
|
||||||
|
if token_ids_1 is not None:
|
||||||
|
raise ValueError("You should not supply a second sequence if the provided sequence of "
|
||||||
|
"ids is already formated with special tokens for the model.")
|
||||||
|
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||||
|
|
||||||
if special_tokens_present:
|
if token_ids_1 is None:
|
||||||
return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
|
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
||||||
|
|
||||||
if token_ids_1:
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0]
|
|
||||||
else:
|
|
||||||
return [0] + ([1] * len(token_ids_0)) + [0]
|
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A RoBERTa sequence pair mask has the following format:
|
A RoBERTa sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
| first sequence | second sequence
|
| first sequence | second sequence
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return len(cls + token_ids_0 + sep) * [0]
|
||||||
return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
|
return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
@@ -538,15 +538,9 @@ class PreTrainedTokenizer(object):
|
|||||||
Returns:
|
Returns:
|
||||||
Number of tokens added to sequences
|
Number of tokens added to sequences
|
||||||
"""
|
"""
|
||||||
|
token_ids_0 = []
|
||||||
if pair:
|
token_ids_1 = []
|
||||||
initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
|
return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
|
||||||
final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
|
|
||||||
else:
|
|
||||||
initial_tokens_len = len(self.encode("This is a sequence"))
|
|
||||||
final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
|
|
||||||
|
|
||||||
return final_tokens_len - initial_tokens_len
|
|
||||||
|
|
||||||
def add_special_tokens(self, special_tokens_dict):
|
def add_special_tokens(self, special_tokens_dict):
|
||||||
"""
|
"""
|
||||||
@@ -795,7 +789,7 @@ class PreTrainedTokenizer(object):
|
|||||||
return_tensors=return_tensors)
|
return_tensors=return_tensors)
|
||||||
|
|
||||||
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
|
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
|
||||||
truncate_first_sequence=True, truncate_both_sequences=True, return_tensors=None):
|
truncation_strategy='longest_first', return_tensors=None):
|
||||||
"""
|
"""
|
||||||
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
|
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
|
||||||
It adds special tokens, truncates
|
It adds special tokens, truncates
|
||||||
@@ -812,6 +806,12 @@ class PreTrainedTokenizer(object):
|
|||||||
to their model.
|
to their model.
|
||||||
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
|
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
|
||||||
list of inputs.
|
list of inputs.
|
||||||
|
truncation_strategy: string selected in the following options:
|
||||||
|
- 'longest_first' (default): Iteratively reduce the input sequences until the total length is at most max_length,
|
||||||
|
removing a token from the longest sequence at each step (when there is a pair of input sequences)
|
||||||
|
- 'only_first': Only truncate the first sequence
|
||||||
|
- 'only_second': Only truncate the second sequence
|
||||||
|
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
|
||||||
truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
|
truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
|
||||||
alongside a specified `max_length`, will truncate the first sequence if the total size is superior
|
alongside a specified `max_length`, will truncate the first sequence if the total size is superior
|
||||||
than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
|
than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
|
||||||
@@ -840,37 +840,17 @@ class PreTrainedTokenizer(object):
|
|||||||
len_pair_ids = len(pair_ids) if pair else 0
|
len_pair_ids = len(pair_ids) if pair else 0
|
||||||
|
|
||||||
encoded_inputs = {}
|
encoded_inputs = {}
|
||||||
if max_length:
|
total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
|
||||||
n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
|
if max_length and total_len > max_length:
|
||||||
|
ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
|
||||||
if n_added_tokens + len_ids + len_pair_ids > max_length:
|
num_tokens_to_remove=total_len-max_length,
|
||||||
if truncate_both_sequences:
|
truncation_strategy=truncation_strategy)
|
||||||
tokens_a, tokens_b = self._truncate_seq_pair(
|
encoded_inputs["overflowing_tokens"] = overflowing_tokens
|
||||||
copy.deepcopy(ids),
|
encoded_inputs["num_truncated_tokens"] = total_len - max_length
|
||||||
copy.deepcopy(pair_ids),
|
|
||||||
max_length=max_length - n_added_tokens
|
|
||||||
)
|
|
||||||
truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
|
|
||||||
encoded_inputs["num_truncated_tokens"] = len(truncated_tokens)
|
|
||||||
ids = tokens_a
|
|
||||||
pair_ids = tokens_b
|
|
||||||
elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
|
|
||||||
logger.warning(
|
|
||||||
"You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
|
|
||||||
"This pair of sequences will not be truncated.")
|
|
||||||
elif truncate_first_sequence or not pair:
|
|
||||||
encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
|
|
||||||
ids = ids[:max_length - len_pair_ids - n_added_tokens]
|
|
||||||
elif not truncate_first_sequence and pair:
|
|
||||||
encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
|
|
||||||
pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
|
|
||||||
else:
|
|
||||||
logger.warning(
|
|
||||||
"Cannot truncate second sequence as it is not provided. No truncation.")
|
|
||||||
|
|
||||||
if add_special_tokens:
|
if add_special_tokens:
|
||||||
sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
|
sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
|
||||||
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
|
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
|
||||||
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
|
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
|
||||||
else:
|
else:
|
||||||
sequence = ids + pair_ids if pair else ids
|
sequence = ids + pair_ids if pair else ids
|
||||||
@@ -895,39 +875,76 @@ class PreTrainedTokenizer(object):
|
|||||||
|
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
|
def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first'):
|
||||||
"""Truncates a sequence pair in place to the maximum length."""
|
"""Truncates a sequence pair in place to the maximum length.
|
||||||
|
truncation_strategy: string selected in the following options:
|
||||||
|
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
|
||||||
|
starting from the longest one at each token (when there is a pair of input sequences).
|
||||||
|
Overflowing tokens only contains overflow from the first sequence.
|
||||||
|
- 'only_first': Only truncate the first sequence. Raises an error if the first sequence is shorter than or equal to num_tokens_to_remove.
|
||||||
|
- 'only_second': Only truncate the second sequence
|
||||||
|
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
|
||||||
|
"""
|
||||||
|
if num_tokens_to_remove <= 0:
|
||||||
|
return ids, pair_ids, []
|
||||||
|
|
||||||
# This is a simple heuristic which will always truncate the longer sequence
|
if truncation_strategy == 'longest_first':
|
||||||
# one token at a time. This makes more sense than truncating an equal percent
|
overflowing_tokens = []
|
||||||
# of tokens from each, since if one sequence is very short then each token
|
for _ in range(num_tokens_to_remove):
|
||||||
# that's truncated likely contains more information than a longer sequence.
|
if pair_ids is None or len(ids) > len(pair_ids):
|
||||||
|
overflowing_tokens.append(ids[-1])
|
||||||
# However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger
|
ids = ids[:-1]
|
||||||
# length or only pop from context
|
|
||||||
while True:
|
|
||||||
total_length = len(tokens_a) + len(tokens_b)
|
|
||||||
if total_length <= max_length:
|
|
||||||
return (tokens_a, tokens_b)
|
|
||||||
if len(tokens_a) > len(tokens_b):
|
|
||||||
tokens_a.pop()
|
|
||||||
else:
|
else:
|
||||||
tokens_b.pop()
|
pair_ids = pair_ids[:-1]
|
||||||
|
elif truncation_strategy == 'only_first':
|
||||||
|
assert len(ids) > num_tokens_to_remove
|
||||||
|
overflowing_tokens = ids[-num_tokens_to_remove:]
|
||||||
|
ids = ids[:-num_tokens_to_remove]
|
||||||
|
elif truncation_strategy == 'only_second':
|
||||||
|
assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
|
||||||
|
overflowing_tokens = pair_ids[-num_tokens_to_remove:]
|
||||||
|
pair_ids = pair_ids[:-num_tokens_to_remove]
|
||||||
|
elif truncation_strategy == 'do_not_truncate':
|
||||||
|
raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")
|
||||||
|
else:
|
||||||
|
raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']")
|
||||||
|
return (ids, pair_ids, overflowing_tokens)
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
logger.warning("This tokenizer does not make use of special tokens.")
|
logger.warning("This tokenizer does not make use of special tokens.")
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return len(token_ids_0) * [0]
|
||||||
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
|
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
|
"""
|
||||||
return token_ids
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
|
by concatenating and adding special tokens.
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
This base implementation adds no special tokens:
|
||||||
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
|
single sequence: X
|
||||||
|
pair of sequences: A B
|
||||||
|
"""
|
||||||
|
logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return token_ids_0
|
||||||
return token_ids_0 + token_ids_1
|
return token_ids_0 + token_ids_1
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
|
"""
|
||||||
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0: list of ids (must not contain special tokens)
|
||||||
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||||
|
for sequence pairs
|
||||||
|
already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
||||||
|
special tokens for the model
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
|
"""
|
||||||
|
return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
|
||||||
|
|
||||||
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
||||||
""" Converts a single index or a sequence of indices (integers) in a token "
|
""" Converts a single index or a sequence of indices (integers) in a token "
|
||||||
|
|||||||
@@ -754,23 +754,21 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = ''.join(tokens).replace('</w>', ' ').strip()
|
out_string = ''.join(tokens).replace('</w>', ' ').strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
An XLM sequence has the following format: [CLS] X [SEP]
|
by concatenating and adding special tokens.
|
||||||
"""
|
An XLM sequence has the following format:
|
||||||
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
single sequence: [CLS] X [SEP]
|
||||||
|
pair of sequences: [CLS] A [SEP] B [SEP]
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
|
||||||
"""
|
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
|
||||||
An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
|
|
||||||
"""
|
"""
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
@@ -779,29 +777,36 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0: list of ids (must not contain special tokens)
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||||
for sequence pairs
|
for sequence pairs
|
||||||
|
already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
||||||
|
special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if special_tokens_present:
|
if already_has_special_tokens:
|
||||||
return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
|
if token_ids_1 is not None:
|
||||||
|
raise ValueError("You should not supply a second sequence if the provided sequence of "
|
||||||
|
"ids is already formated with special tokens for the model.")
|
||||||
|
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||||
|
|
||||||
if token_ids_1:
|
if token_ids_1 is not None:
|
||||||
return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
|
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||||
else:
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
return [0] + ([1] * len(token_ids_0)) + [0]
|
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
An XLM sequence pair mask has the following format:
|
An XLM sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
| first sequence | second sequence
|
| first sequence | second sequence
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return len(cls + token_ids_0 + sep) * [0]
|
||||||
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
|
|||||||
@@ -181,26 +181,21 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
|
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
An XLNet sequence has the following format: X [SEP][CLS]
|
by concatenating and adding special tokens.
|
||||||
|
An XLNet sequence has the following format:
|
||||||
|
single sequence: X [SEP][CLS]
|
||||||
|
pair of sequences: A [SEP] B [SEP][CLS]
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return token_ids + sep + cls
|
if token_ids_1 is None:
|
||||||
|
return token_ids_0 + sep + cls
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
|
||||||
"""
|
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
|
||||||
An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
|
|
||||||
"""
|
|
||||||
|
|
||||||
sep = [self.sep_token_id]
|
|
||||||
cls = [self.cls_token_id]
|
|
||||||
return token_ids_0 + sep + token_ids_1 + sep + cls
|
return token_ids_0 + sep + token_ids_1 + sep + cls
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
@@ -209,30 +204,38 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0: list of ids (must not contain special tokens)
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||||
for sequence pairs
|
for sequence pairs
|
||||||
|
already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
||||||
|
special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if special_tokens_present:
|
if already_has_special_tokens:
|
||||||
return list(map(lambda x: 0 if x in [self.sep_token_id, self.cls_token_id] else 1, token_ids_0))
|
if token_ids_1 is not None:
|
||||||
|
raise ValueError("You should not supply a second sequence if the provided sequence of "
|
||||||
|
"ids is already formated with special tokens for the model.")
|
||||||
|
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||||
|
|
||||||
if token_ids_1:
|
if token_ids_1 is not None:
|
||||||
return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0]
|
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
|
||||||
else:
|
return ([0] * len(token_ids_0)) + [1, 1]
|
||||||
return ([1] * len(token_ids_0)) + [0, 0]
|
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
An XLNet sequence pair mask has the following format:
|
An XLNet sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
|
||||||
| first sequence | second sequence | CLS segment ID
|
| first sequence | second sequence | CLS segment ID
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
cls_segment_id = [2]
|
cls_segment_id = [2]
|
||||||
|
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return len(token_ids_0 + sep + cls) * [0]
|
||||||
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
|
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
|
|||||||
Reference in New Issue
Block a user