Mask computing in standalone method. Tests.
This commit is contained in:
@@ -187,18 +187,18 @@ class CommonTestCases:
|
|||||||
for weights_list_2 in weights_lists_2:
|
for weights_list_2 in weights_lists_2:
|
||||||
self.assertListEqual(weights_list, weights_list_2)
|
self.assertListEqual(weights_list, weights_list_2)
|
||||||
|
|
||||||
# def test_mask_output(self):
|
def test_mask_output(self):
|
||||||
# if sys.version_info <= (3, 0):
|
if sys.version_info <= (3, 0):
|
||||||
# return
|
return
|
||||||
#
|
|
||||||
# tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
#
|
|
||||||
# if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
|
if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
|
||||||
# seq_0 = "Test this method."
|
seq_0 = "Test this method."
|
||||||
# seq_1 = "With these inputs."
|
seq_1 = "With these inputs."
|
||||||
# information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
|
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
|
||||||
# sequences, mask = information["sequence"], information["mask"]
|
sequences, mask = information["sequence"], information["mask"]
|
||||||
# assert len(sequences) == len(mask)
|
assert len(sequences) == len(mask)
|
||||||
|
|
||||||
def test_number_of_added_tokens(self):
|
def test_number_of_added_tokens(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|||||||
@@ -204,6 +204,18 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
|
def create_mask_from_sequences(self, sequence_0, sequence_1):
|
||||||
|
"""
|
||||||
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
|
A BERT sequence pair mask has the following format:
|
||||||
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
|
| first sequence | second sequence
|
||||||
|
"""
|
||||||
|
sep = [self.sep_token_id]
|
||||||
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
|
return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, vocab_path):
|
def save_vocabulary(self, vocab_path):
|
||||||
"""Save the tokenizer vocabulary to a directory or file."""
|
"""Save the tokenizer vocabulary to a directory or file."""
|
||||||
index = 0
|
index = 0
|
||||||
|
|||||||
@@ -64,12 +64,18 @@ class DistilBertTokenizer(BertTokenizer):
|
|||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def add_special_tokens_single_sequence(self, token_ids):
|
||||||
return token_ids
|
return token_ids
|
||||||
|
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1, output_mask=False):
|
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
if output_mask:
|
|
||||||
return (
|
|
||||||
token_ids_0 + sep + token_ids_1,
|
|
||||||
[0] * len(token_ids_0 + sep) + [1] * len(token_ids_1)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
return token_ids_0 + sep + token_ids_1
|
return token_ids_0 + sep + token_ids_1
|
||||||
|
|
||||||
|
def create_mask_from_sequences(self, sequence_0, sequence_1):
|
||||||
|
"""
|
||||||
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
|
A DistilBERT sequence pair mask has the following format:
|
||||||
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
|
| first sequence | second sequence
|
||||||
|
"""
|
||||||
|
sep = [self.sep_token_id]
|
||||||
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
|
return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1)) * [1]
|
||||||
|
|||||||
@@ -96,3 +96,15 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||||
|
|
||||||
|
def create_mask_from_sequences(self, sequence_0, sequence_1):
|
||||||
|
"""
|
||||||
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
|
A RoBERTa sequence pair mask has the following format:
|
||||||
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
|
| first sequence | second sequence
|
||||||
|
"""
|
||||||
|
sep = [self.sep_token_id]
|
||||||
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
|
return len(cls + self.encode(sequence_0) + sep + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
|
||||||
@@ -779,8 +779,8 @@ class PreTrainedTokenizer(object):
|
|||||||
second_sentence_tokens
|
second_sentence_tokens
|
||||||
)
|
)
|
||||||
|
|
||||||
# if output_mask:
|
if output_mask:
|
||||||
# sequence, information["mask"] = encoded_sequence
|
information["mask"] = self.create_mask_from_sequences(text, text_pair)
|
||||||
|
|
||||||
information["sequence"] = sequence
|
information["sequence"] = sequence
|
||||||
else:
|
else:
|
||||||
@@ -797,6 +797,10 @@ class PreTrainedTokenizer(object):
|
|||||||
|
|
||||||
return information
|
return information
|
||||||
|
|
||||||
|
def create_mask_from_sequences(self, sequence_0, sequence_1):
|
||||||
|
logger.warning("This tokenizer does not make use of special tokens.")
|
||||||
|
return [0] * len(self.encode(sequence_0)) + [1] * len(self.encode(sequence_1))
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def add_special_tokens_single_sequence(self, token_ids):
|
||||||
logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
|
logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
|
||||||
return token_ids
|
return token_ids
|
||||||
|
|||||||
@@ -770,6 +770,18 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
|
def create_mask_from_sequences(self, sequence_0, sequence_1):
|
||||||
|
"""
|
||||||
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
|
An XLM sequence pair mask has the following format:
|
||||||
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
|
| first sequence | second sequence
|
||||||
|
"""
|
||||||
|
sep = [self.sep_token_id]
|
||||||
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
|
return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
"""Save the tokenizer vocabulary and merge files to a directory."""
|
"""Save the tokenizer vocabulary and merge files to a directory."""
|
||||||
if not os.path.isdir(save_directory):
|
if not os.path.isdir(save_directory):
|
||||||
|
|||||||
@@ -198,9 +198,21 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
cls_segment_ids = [2]
|
|
||||||
return token_ids_0 + sep + token_ids_1 + sep + cls
|
return token_ids_0 + sep + token_ids_1 + sep + cls
|
||||||
|
|
||||||
|
def create_mask_from_sequences(self, sequence_0, sequence_1):
|
||||||
|
"""
|
||||||
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
|
An XLNet sequence pair mask has the following format:
|
||||||
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
|
||||||
|
| first sequence | second sequence | CLS segment ID
|
||||||
|
"""
|
||||||
|
sep = [self.sep_token_id]
|
||||||
|
cls = [self.cls_token_id]
|
||||||
|
cls_segment_id = [2]
|
||||||
|
|
||||||
|
return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1] + cls_segment_id
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
||||||
to a directory.
|
to a directory.
|
||||||
|
|||||||
Reference in New Issue
Block a user