Update naming + remove f string in run_lm_finetuning example
This commit is contained in:
@@ -59,7 +59,7 @@ class TextDataset(Dataset):
|
|||||||
def __init__(self, tokenizer, file_path='train', block_size=512):
|
def __init__(self, tokenizer, file_path='train', block_size=512):
|
||||||
assert os.path.isfile(file_path)
|
assert os.path.isfile(file_path)
|
||||||
directory, filename = os.path.split(file_path)
|
directory, filename = os.path.split(file_path)
|
||||||
cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
|
cached_features_file = os.path.join(directory, 'cached_lm_' + str(block_size) + '_' + filename)
|
||||||
|
|
||||||
if os.path.exists(cached_features_file):
|
if os.path.exists(cached_features_file):
|
||||||
logger.info("Loading features from cached file %s", cached_features_file)
|
logger.info("Loading features from cached file %s", cached_features_file)
|
||||||
@@ -110,7 +110,7 @@ def mask_tokens(inputs, tokenizer, args):
|
|||||||
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
|
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
|
||||||
probability_matrix = torch.full(labels.shape, args.mlm_probability)
|
probability_matrix = torch.full(labels.shape, args.mlm_probability)
|
||||||
probability_matrix *= torch.tensor(
|
probability_matrix *= torch.tensor(
|
||||||
[tokenizer.get_sequence_ids(val, special_tokens_present=True) for val in labels.tolist()],
|
[tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()],
|
||||||
dtype=torch.float
|
dtype=torch.float
|
||||||
)
|
)
|
||||||
masked_indices = torch.bernoulli(probability_matrix).bool()
|
masked_indices = torch.bernoulli(probability_matrix).bool()
|
||||||
|
|||||||
@@ -276,7 +276,7 @@ class CommonTestCases:
|
|||||||
assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
|
assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
|
||||||
assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
|
assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
|
||||||
|
|
||||||
def test_sequence_ids(self):
|
def test_special_tokens_mask(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
sequence_0 = "Encode this."
|
sequence_0 = "Encode this."
|
||||||
@@ -286,10 +286,10 @@ class CommonTestCases:
|
|||||||
encoded_sequence = tokenizer.encode(sequence_0)
|
encoded_sequence = tokenizer.encode(sequence_0)
|
||||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
||||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
sequence_ids = encoded_sequence_dict["sequence_ids"]
|
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||||
assert len(sequence_ids) == len(encoded_sequence_w_special)
|
assert len(special_tokens_mask) == len(encoded_sequence_w_special)
|
||||||
|
|
||||||
filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
|
filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
|
||||||
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
||||||
assert encoded_sequence == filtered_sequence
|
assert encoded_sequence == filtered_sequence
|
||||||
|
|
||||||
@@ -297,10 +297,10 @@ class CommonTestCases:
|
|||||||
encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
|
encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
|
||||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
|
||||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
sequence_ids = encoded_sequence_dict["sequence_ids"]
|
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||||
assert len(sequence_ids) == len(encoded_sequence_w_special)
|
assert len(special_tokens_mask) == len(encoded_sequence_w_special)
|
||||||
|
|
||||||
filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
|
filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
|
||||||
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
||||||
assert encoded_sequence == filtered_sequence
|
assert encoded_sequence == filtered_sequence
|
||||||
|
|
||||||
@@ -309,10 +309,10 @@ class CommonTestCases:
|
|||||||
tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
|
tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
|
||||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
||||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
sequence_ids_orig = encoded_sequence_dict["sequence_ids"]
|
special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
|
||||||
sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True)
|
special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, special_tokens_present=True)
|
||||||
assert len(sequence_ids) == len(encoded_sequence_w_special)
|
assert len(special_tokens_mask) == len(encoded_sequence_w_special)
|
||||||
assert sequence_ids_orig == sequence_ids
|
assert special_tokens_mask_orig == special_tokens_mask
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves the special tokens mask from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|||||||
@@ -100,7 +100,7 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves the special tokens mask from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|||||||
@@ -820,7 +820,7 @@ class PreTrainedTokenizer(object):
|
|||||||
{
|
{
|
||||||
input_ids: list[int],
|
input_ids: list[int],
|
||||||
overflowing_tokens: list[int] if a ``max_length`` is specified, else None
|
overflowing_tokens: list[int] if a ``max_length`` is specified, else None
|
||||||
sequence_ids: list[int] if ``add_special_tokens`` if set to ``True``
|
special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True``
|
||||||
}
|
}
|
||||||
|
|
||||||
With the fields:
|
With the fields:
|
||||||
@@ -828,7 +828,7 @@ class PreTrainedTokenizer(object):
|
|||||||
|
|
||||||
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
|
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
|
||||||
|
|
||||||
``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
|
``special_tokens_mask``: if adding special tokens, this is a list of 0s and 1s, with 0 specifying special added
|
||||||
tokens and 1 specifying sequence tokens.
|
tokens and 1 specifying sequence tokens.
|
||||||
"""
|
"""
|
||||||
pair = bool(pair_ids is not None)
|
pair = bool(pair_ids is not None)
|
||||||
@@ -857,7 +857,7 @@ class PreTrainedTokenizer(object):
|
|||||||
if add_special_tokens:
|
if add_special_tokens:
|
||||||
sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
|
sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
|
||||||
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
|
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
|
||||||
encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids)
|
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
|
||||||
else:
|
else:
|
||||||
sequence = ids + pair_ids if pair else ids
|
sequence = ids + pair_ids if pair else ids
|
||||||
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
|
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
|
||||||
@@ -877,6 +877,7 @@ class PreTrainedTokenizer(object):
|
|||||||
if max_length and len(encoded_inputs["input_ids"]) > max_length:
|
if max_length and len(encoded_inputs["input_ids"]) > max_length:
|
||||||
encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
|
encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
|
||||||
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
|
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
|
||||||
|
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
|
||||||
|
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
@@ -892,7 +893,7 @@ class PreTrainedTokenizer(object):
|
|||||||
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
|
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
|
||||||
return token_ids_0 + token_ids_1
|
return token_ids_0 + token_ids_1
|
||||||
|
|
||||||
def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
||||||
return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
|
return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
|
||||||
|
|
||||||
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
||||||
|
|||||||
@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves the special tokens mask from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|||||||
@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return token_ids_0 + sep + token_ids_1 + sep + cls
|
return token_ids_0 + sep + token_ids_1 + sep + cls
|
||||||
|
|
||||||
def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves the special tokens mask from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|||||||
Reference in New Issue
Block a user