Update naming + remove f string in run_lm_finetuning example

This commit is contained in:
LysandreJik
2019-10-02 18:04:38 -04:00
parent 651bfb7ad5
commit aebd83230f
7 changed files with 22 additions and 21 deletions

View File

@@ -59,7 +59,7 @@ class TextDataset(Dataset):
def __init__(self, tokenizer, file_path='train', block_size=512): def __init__(self, tokenizer, file_path='train', block_size=512):
assert os.path.isfile(file_path) assert os.path.isfile(file_path)
directory, filename = os.path.split(file_path) directory, filename = os.path.split(file_path)
cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename)) cached_features_file = os.path.join(directory, 'cached_lm_' + block_size + '_' + filename)
if os.path.exists(cached_features_file): if os.path.exists(cached_features_file):
logger.info("Loading features from cached file %s", cached_features_file) logger.info("Loading features from cached file %s", cached_features_file)
@@ -110,7 +110,7 @@ def mask_tokens(inputs, tokenizer, args):
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability, which defaults to 0.15 in Bert/RoBERTa) # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability, which defaults to 0.15 in Bert/RoBERTa)
probability_matrix = torch.full(labels.shape, args.mlm_probability) probability_matrix = torch.full(labels.shape, args.mlm_probability)
probability_matrix *= torch.tensor( probability_matrix *= torch.tensor(
[tokenizer.get_sequence_ids(val, special_tokens_present=True) for val in labels.tolist()], [tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()],
dtype=torch.float dtype=torch.float
) )
masked_indices = torch.bernoulli(probability_matrix).bool() masked_indices = torch.bernoulli(probability_matrix).bool()

View File

@@ -276,7 +276,7 @@ class CommonTestCases:
assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
def test_sequence_ids(self): def test_special_tokens_mask(self):
tokenizer = self.get_tokenizer() tokenizer = self.get_tokenizer()
sequence_0 = "Encode this." sequence_0 = "Encode this."
@@ -286,10 +286,10 @@ class CommonTestCases:
encoded_sequence = tokenizer.encode(sequence_0) encoded_sequence = tokenizer.encode(sequence_0)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"] encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
sequence_ids = encoded_sequence_dict["sequence_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
assert len(sequence_ids) == len(encoded_sequence_w_special) assert len(special_tokens_mask) == len(encoded_sequence_w_special)
filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)] filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
filtered_sequence = [x for x in filtered_sequence if x is not None] filtered_sequence = [x for x in filtered_sequence if x is not None]
assert encoded_sequence == filtered_sequence assert encoded_sequence == filtered_sequence
@@ -297,10 +297,10 @@ class CommonTestCases:
encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1) encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"] encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
sequence_ids = encoded_sequence_dict["sequence_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
assert len(sequence_ids) == len(encoded_sequence_w_special) assert len(special_tokens_mask) == len(encoded_sequence_w_special)
filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)] filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
filtered_sequence = [x for x in filtered_sequence if x is not None] filtered_sequence = [x for x in filtered_sequence if x is not None]
assert encoded_sequence == filtered_sequence assert encoded_sequence == filtered_sequence
@@ -309,10 +309,10 @@ class CommonTestCases:
tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'}) tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"] encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
sequence_ids_orig = encoded_sequence_dict["sequence_ids"] special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True) special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, special_tokens_present=True)
assert len(sequence_ids) == len(encoded_sequence_w_special) assert len(special_tokens_mask) == len(encoded_sequence_w_special)
assert sequence_ids_orig == sequence_ids assert special_tokens_mask_orig == special_tokens_mask

View File

@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
return cls + token_ids_0 + sep + token_ids_1 + sep return cls + token_ids_0 + sep + token_ids_1 + sep
def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

View File

@@ -100,7 +100,7 @@ class RobertaTokenizer(GPT2Tokenizer):
cls = [self.cls_token_id] cls = [self.cls_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

View File

@@ -820,7 +820,7 @@ class PreTrainedTokenizer(object):
{ {
input_ids: list[int], input_ids: list[int],
overflowing_tokens: list[int] if a ``max_length`` is specified, else None overflowing_tokens: list[int] if a ``max_length`` is specified, else None
sequence_ids: list[int] if ``add_special_tokens`` is set to ``True`` special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True``
} }
With the fields: With the fields:
@@ -828,7 +828,7 @@ class PreTrainedTokenizer(object):
``overflowing_tokens``: list of overflowing tokens if a max length is specified. ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens. tokens and 1 specifying sequence tokens.
""" """
pair = bool(pair_ids is not None) pair = bool(pair_ids is not None)
@@ -857,7 +857,7 @@ class PreTrainedTokenizer(object):
if add_special_tokens: if add_special_tokens:
sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids) sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids) encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
else: else:
sequence = ids + pair_ids if pair else ids sequence = ids + pair_ids if pair else ids
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
@@ -877,6 +877,7 @@ class PreTrainedTokenizer(object):
if max_length and len(encoded_inputs["input_ids"]) > max_length: if max_length and len(encoded_inputs["input_ids"]) > max_length:
encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length] encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length] encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
return encoded_inputs return encoded_inputs
@@ -892,7 +893,7 @@ class PreTrainedTokenizer(object):
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.") logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
return token_ids_0 + token_ids_1 return token_ids_0 + token_ids_1
def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
def convert_ids_to_tokens(self, ids, skip_special_tokens=False): def convert_ids_to_tokens(self, ids, skip_special_tokens=False):

View File

@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
cls = [self.cls_token_id] cls = [self.cls_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep return cls + token_ids_0 + sep + token_ids_1 + sep
def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

View File

@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
cls = [self.cls_token_id] cls = [self.cls_token_id]
return token_ids_0 + sep + token_ids_1 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls
def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.