various updates
This commit is contained in:
@@ -409,7 +409,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
|
|||||||
example.text_a,
|
example.text_a,
|
||||||
example.text_b,
|
example.text_b,
|
||||||
add_special_tokens=True,
|
add_special_tokens=True,
|
||||||
output_token_type=True,
|
|
||||||
max_length=max_seq_length,
|
max_length=max_seq_length,
|
||||||
truncate_first_sequence=True # We're truncating the first sequence as a priority
|
truncate_first_sequence=True # We're truncating the first sequence as a priority
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -196,7 +196,7 @@ class CommonTestCases:
|
|||||||
if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
|
if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
|
||||||
seq_0 = "Test this method."
|
seq_0 = "Test this method."
|
||||||
seq_1 = "With these inputs."
|
seq_1 = "With these inputs."
|
||||||
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_token_type=True)
|
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
|
||||||
sequences, mask = information["input_ids"], information["token_type_ids"]
|
sequences, mask = information["input_ids"], information["token_type_ids"]
|
||||||
assert len(sequences) == len(mask)
|
assert len(sequences) == len(mask)
|
||||||
|
|
||||||
|
|||||||
@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A BERT sequence pair mask has the following format:
|
A BERT sequence pair mask has the following format:
|
||||||
@@ -214,7 +214,7 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
|
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, vocab_path):
|
def save_vocabulary(self, vocab_path):
|
||||||
"""Save the tokenizer vocabulary to a directory or file."""
|
"""Save the tokenizer vocabulary to a directory or file."""
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A RoBERTa sequence pair mask has the following format:
|
A RoBERTa sequence pair mask has the following format:
|
||||||
@@ -107,4 +107,4 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
return len(cls + self.encode(sequence_0) + sep + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
|
return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
@@ -704,13 +704,14 @@ class PreTrainedTokenizer(object):
|
|||||||
to their model.
|
to their model.
|
||||||
**kwargs: passed to the `self.tokenize()` method
|
**kwargs: passed to the `self.tokenize()` method
|
||||||
"""
|
"""
|
||||||
return self.encode_plus(text, text_pair, add_special_tokens, **kwargs)["input_ids"]
|
encoded_inputs = self.encode_plus(text, text_pair=text_pair, add_special_tokens=add_special_tokens, **kwargs)
|
||||||
|
|
||||||
|
return encoded_inputs["input_ids"]
|
||||||
|
|
||||||
def encode_plus(self,
|
def encode_plus(self,
|
||||||
text,
|
text,
|
||||||
text_pair=None,
|
text_pair=None,
|
||||||
add_special_tokens=False,
|
add_special_tokens=False,
|
||||||
output_token_type=False,
|
|
||||||
max_length=None,
|
max_length=None,
|
||||||
stride=0,
|
stride=0,
|
||||||
truncate_first_sequence=True,
|
truncate_first_sequence=True,
|
||||||
@@ -728,8 +729,6 @@ class PreTrainedTokenizer(object):
|
|||||||
`convert_tokens_to_ids` method)
|
`convert_tokens_to_ids` method)
|
||||||
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
|
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
|
||||||
to their model.
|
to their model.
|
||||||
output_token_type: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
|
|
||||||
and 1 for the second.
|
|
||||||
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
|
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
|
||||||
If there are overflowing tokens, those will be added to the returned dictionary
|
If there are overflowing tokens, those will be added to the returned dictionary
|
||||||
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
|
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
|
||||||
@@ -739,133 +738,89 @@ class PreTrainedTokenizer(object):
|
|||||||
**kwargs: passed to the `self.tokenize()` method
|
**kwargs: passed to the `self.tokenize()` method
|
||||||
"""
|
"""
|
||||||
|
|
||||||
information = {}
|
|
||||||
|
|
||||||
def get_input_ids(text):
|
def get_input_ids(text):
|
||||||
if isinstance(text, six.string_types):
|
if isinstance(text, six.string_types):
|
||||||
input_ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
|
return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
|
||||||
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
|
||||||
input_ids = self.convert_tokens_to_ids(text)
|
return self.convert_tokens_to_ids(text)
|
||||||
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
|
||||||
input_ids = text
|
return text
|
||||||
else:
|
else:
|
||||||
raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
|
raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
|
||||||
|
|
||||||
return input_ids
|
first_ids = get_input_ids(text)
|
||||||
|
second_ids = get_input_ids(text_pair) if text_pair is not None else None
|
||||||
|
|
||||||
if text_pair is None:
|
return self.prepare_for_model(first_ids,
|
||||||
sequence_tokens = get_input_ids(text)
|
pair_ids=second_ids,
|
||||||
|
|
||||||
if add_special_tokens:
|
|
||||||
information = self.prepare_for_model(sequence_tokens, max_length=max_length, stride=stride)
|
|
||||||
else:
|
|
||||||
if max_length:
|
|
||||||
information["overflowing_tokens"] = sequence_tokens[max_length - stride:]
|
|
||||||
sequence_tokens = sequence_tokens[:max_length]
|
|
||||||
information["input_ids"] = sequence_tokens
|
|
||||||
|
|
||||||
if output_token_type:
|
|
||||||
information["token_type_ids"] = [0] * len(information["input_ids"])
|
|
||||||
else:
|
|
||||||
first_sentence_tokens = get_input_ids(text)
|
|
||||||
second_sentence_tokens = get_input_ids(text_pair)
|
|
||||||
|
|
||||||
if add_special_tokens:
|
|
||||||
information = self.prepare_pair_for_model(
|
|
||||||
first_sentence_tokens,
|
|
||||||
second_sentence_tokens,
|
|
||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
truncate_first_sequence=truncate_first_sequence,
|
add_special_tokens=add_special_tokens,
|
||||||
stride=stride
|
stride=stride,
|
||||||
)
|
truncate_first_sequence=truncate_first_sequence)
|
||||||
|
|
||||||
if output_token_type:
|
|
||||||
information["token_type_ids"] = self.create_token_type_ids_from_sequences(text, text_pair)
|
|
||||||
else:
|
|
||||||
logger.warning("No special tokens were added. The two sequences have been concatenated.")
|
|
||||||
sequence = first_sentence_tokens + second_sentence_tokens
|
|
||||||
|
|
||||||
if max_length:
|
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, truncate_first_sequence=True):
|
||||||
information["overflowing_tokens"] = sequence[max_length - stride:]
|
|
||||||
sequence = sequence[:max_length]
|
|
||||||
if output_token_type:
|
|
||||||
information["token_type_ids"] = [0] * len(sequence)
|
|
||||||
|
|
||||||
information["input_ids"] = sequence
|
|
||||||
|
|
||||||
return information
|
|
||||||
|
|
||||||
def prepare_for_model(self, ids, max_length=None, stride=0):
|
|
||||||
"""
|
"""
|
||||||
Prepares a list of tokenized input ids so that it can be used by the model. It adds special tokens, truncates
|
Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
|
||||||
|
It adds special tokens, truncates
|
||||||
sequences if overflowing while taking into account the special tokens and manages a window stride for
|
sequences if overflowing while taking into account the special tokens and manages a window stride for
|
||||||
overflowing tokens
|
overflowing tokens
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
ids: list of tokenized input ids. Can be obtained from a string by chaining the
|
ids: list of tokenized input ids. Can be obtained from a string by chaining the
|
||||||
`tokenize` and `convert_tokens_to_ids` methods.
|
`tokenize` and `convert_tokens_to_ids` methods.
|
||||||
|
pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
|
||||||
|
`tokenize` and `convert_tokens_to_ids` methods.
|
||||||
max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
|
max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
|
||||||
|
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
|
||||||
|
to their model.
|
||||||
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
|
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
|
||||||
list of inputs.
|
list of inputs.
|
||||||
|
truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
|
||||||
|
alongside a specified `max_length`, will truncate the first sequence if the total size is greater
|
||||||
|
than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
|
||||||
|
|
||||||
Return:
|
Return:
|
||||||
a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
|
a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
|
||||||
"""
|
"""
|
||||||
information = {}
|
pair = bool(pair_ids is not None)
|
||||||
|
len_ids = len(ids)
|
||||||
|
len_pair_ids = len(pair_ids) if pair else 0
|
||||||
|
|
||||||
|
encoded_inputs = {}
|
||||||
if max_length:
|
if max_length:
|
||||||
n_added_tokens = self.num_added_tokens()
|
n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
|
||||||
information["overflowing_tokens"] = ids[max_length - n_added_tokens - stride:]
|
if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
|
||||||
ids = ids[:max_length - n_added_tokens]
|
|
||||||
information["input_ids"] = self.add_special_tokens_single_sequence(ids)
|
|
||||||
|
|
||||||
return information
|
|
||||||
|
|
||||||
def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_first_sequence=True, stride=0):
|
|
||||||
"""
|
|
||||||
Prepares a list of tokenized input ids pair so that it can be used by the model. It adds special tokens,
|
|
||||||
truncates sequences if overflowing while taking into account the special tokens and manages a window stride for
|
|
||||||
overflowing tokens
|
|
||||||
|
|
||||||
Args:
|
|
||||||
ids_0: list of tokenized input ids. Can be obtained from a string by chaining the
|
|
||||||
`tokenize` and `convert_tokens_to_ids` methods.
|
|
||||||
ids_1: second list of tokenized input ids. Can be obtained from a string by chaining the
|
|
||||||
`tokenize` and `convert_tokens_to_ids` methods.
|
|
||||||
max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
|
|
||||||
truncate_first_sequence: if set to `True`, alongside a specified `max_length`, will truncate the first
|
|
||||||
sequence if the total size is superior than the specified `max_length`. If set to `False`, will
|
|
||||||
truncate the second sequence instead.
|
|
||||||
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
|
|
||||||
list of inputs.
|
|
||||||
|
|
||||||
Return:
|
|
||||||
a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
|
|
||||||
"""
|
|
||||||
f_len, s_len = len(ids_0), len(ids_1)
|
|
||||||
information = {}
|
|
||||||
|
|
||||||
if max_length:
|
|
||||||
n_added_tokens = self.num_added_tokens(pair=True)
|
|
||||||
if len(ids_0) + n_added_tokens >= max_length:
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"The first sequence is longer than the maximum specified length. This sequence will not be truncated.")
|
"You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
|
||||||
|
"This pair of sequences will not be truncated.")
|
||||||
else:
|
else:
|
||||||
if f_len + s_len + self.num_added_tokens(pair=True) > max_length:
|
if n_added_tokens + len_ids + len_pair_ids > max_length:
|
||||||
if truncate_first_sequence:
|
if truncate_first_sequence or not pair:
|
||||||
information["overflowing_tokens"] = ids_0[max_length - s_len - n_added_tokens - stride:]
|
encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
|
||||||
ids_0 = ids_0[:max_length - s_len - n_added_tokens]
|
ids = ids[:max_length - len_pair_ids - n_added_tokens]
|
||||||
|
elif not truncate_first_sequence and pair:
|
||||||
|
encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
|
||||||
|
pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
|
||||||
else:
|
else:
|
||||||
information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
|
logger.warning(
|
||||||
ids_1 = ids_1[:max_length - f_len - n_added_tokens]
|
"Cannot truncate second sequence as it is not provided. No truncation.")
|
||||||
|
|
||||||
sequence = self.add_special_tokens_sequence_pair(ids_0, ids_1)
|
if add_special_tokens:
|
||||||
information["input_ids"] = sequence
|
sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
|
||||||
|
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
|
||||||
|
else:
|
||||||
|
sequence = ids + pair_ids if pair else ids
|
||||||
|
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
|
||||||
|
|
||||||
return information
|
encoded_inputs["input_ids"] = sequence
|
||||||
|
encoded_inputs["token_type_ids"] = token_type_ids
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
|
return encoded_inputs
|
||||||
|
|
||||||
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
||||||
logger.warning("This tokenizer does not make use of special tokens.")
|
logger.warning("This tokenizer does not make use of special tokens.")
|
||||||
return [0] * len(self.encode(sequence_0)) + [1] * len(self.encode(sequence_1))
|
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def add_special_tokens_single_sequence(self, token_ids):
|
||||||
logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
|
logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
|
||||||
|
|||||||
@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
An XLM sequence pair mask has the following format:
|
An XLM sequence pair mask has the following format:
|
||||||
@@ -780,7 +780,7 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
|
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
"""Save the tokenizer vocabulary and merge files to a directory."""
|
"""Save the tokenizer vocabulary and merge files to a directory."""
|
||||||
|
|||||||
@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return token_ids_0 + sep + token_ids_1 + sep + cls
|
return token_ids_0 + sep + token_ids_1 + sep + cls
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
An XLNet sequence pair mask has the following format:
|
An XLNet sequence pair mask has the following format:
|
||||||
@@ -211,7 +211,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
cls_segment_id = [2]
|
cls_segment_id = [2]
|
||||||
|
|
||||||
return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1] + cls_segment_id
|
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
||||||
|
|||||||
Reference in New Issue
Block a user