From aebd83230f4a70df386ea2104c21ec46e9320f7f Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 2 Oct 2019 18:04:38 -0400 Subject: [PATCH] Update naming + remove f string in run_lm_finetuning example --- examples/run_lm_finetuning.py | 4 ++-- .../tests/tokenization_tests_commons.py | 22 +++++++++---------- transformers/tokenization_bert.py | 2 +- transformers/tokenization_roberta.py | 2 +- transformers/tokenization_utils.py | 9 ++++---- transformers/tokenization_xlm.py | 2 +- transformers/tokenization_xlnet.py | 2 +- 7 files changed, 22 insertions(+), 21 deletions(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 024b254b56..585bbc8d75 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -59,7 +59,7 @@ class TextDataset(Dataset): def __init__(self, tokenizer, file_path='train', block_size=512): assert os.path.isfile(file_path) directory, filename = os.path.split(file_path) - cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename)) + cached_features_file = os.path.join(directory, 'cached_lm_' + str(block_size) + '_' + filename) if os.path.exists(cached_features_file): logger.info("Loading features from cached file %s", cached_features_file) @@ -110,7 +110,7 @@ def mask_tokens(inputs, tokenizer, args): # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, args.mlm_probability) probability_matrix *= torch.tensor( - [tokenizer.get_sequence_ids(val, special_tokens_present=True) for val in labels.tolist()], + [tokenizer.get_special_tokens_mask(val, special_tokens_present=True) for val in labels.tolist()], dtype=torch.float ) masked_indices = torch.bernoulli(probability_matrix).bool() diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index f4805598cd..c902347fe5 100644 --- 
a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -276,7 +276,7 @@ class CommonTestCases: assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input - def test_sequence_ids(self): + def test_special_tokens_mask(self): tokenizer = self.get_tokenizer() sequence_0 = "Encode this." @@ -286,10 +286,10 @@ class CommonTestCases: encoded_sequence = tokenizer.encode(sequence_0) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - sequence_ids = encoded_sequence_dict["sequence_ids"] - assert len(sequence_ids) == len(encoded_sequence_w_special) + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + assert len(special_tokens_mask) == len(encoded_sequence_w_special) - filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] filtered_sequence = [x for x in filtered_sequence if x is not None] assert encoded_sequence == filtered_sequence @@ -297,10 +297,10 @@ class CommonTestCases: encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - sequence_ids = encoded_sequence_dict["sequence_ids"] - assert len(sequence_ids) == len(encoded_sequence_w_special) + special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] + assert len(special_tokens_mask) == len(encoded_sequence_w_special) - filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [(x if special_tokens_mask[i] else None) for i, x in 
enumerate(encoded_sequence_w_special)] filtered_sequence = [x for x in filtered_sequence if x is not None] assert encoded_sequence == filtered_sequence @@ -309,10 +309,10 @@ class CommonTestCases: tokenizer.add_special_tokens({'cls_token': '', 'sep_token': ''}) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] - sequence_ids_orig = encoded_sequence_dict["sequence_ids"] - sequence_ids = tokenizer.get_sequence_ids(encoded_sequence_w_special, special_tokens_present=True) - assert len(sequence_ids) == len(encoded_sequence_w_special) - assert sequence_ids_orig == sequence_ids + special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"] + special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, special_tokens_present=True) + assert len(special_tokens_mask) == len(encoded_sequence_w_special) + assert special_tokens_mask_orig == special_tokens_mask diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index 72720bfa4e..cf6ab51d02 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer): return cls + token_ids_0 + sep + token_ids_1 + sep - def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index 6f19ee699c..ceb1245169 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -100,7 +100,7 @@ class RobertaTokenizer(GPT2Tokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep - def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 527ee8422d..bb69df8698 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -820,7 +820,7 @@ class PreTrainedTokenizer(object): { input_ids: list[int], overflowing_tokens: list[int] if a ``max_length`` is specified, else None - sequence_ids: list[int] if ``add_special_tokens`` if set to ``True`` + special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True`` } With the fields: @@ -828,7 +828,7 @@ class PreTrainedTokenizer(object): ``overflowing_tokens``: list of overflowing tokens if a max length is specified. - ``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added tokens and 1 specifying sequence tokens. 
""" pair = bool(pair_ids is not None) @@ -857,7 +857,7 @@ class PreTrainedTokenizer(object): if add_special_tokens: sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence) - encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids) + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) @@ -877,6 +877,7 @@ class PreTrainedTokenizer(object): if max_length and len(encoded_inputs["input_ids"]) > max_length: encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length] encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length] + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length] return encoded_inputs @@ -892,7 +893,7 @@ class PreTrainedTokenizer(object): logger.warning("This tokenizer does not make use of special tokens. 
The two sequences have been concatenated.") return token_ids_0 + token_ids_1 - def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) def convert_ids_to_tokens(self, ids, skip_special_tokens=False): diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index 3f398853dc..817c6d8c44 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return cls + token_ids_0 + sep + token_ids_1 + sep - def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index 325a32ef6f..37f975abef 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer): cls = [self.cls_token_id] return token_ids_0 + sep + token_ids_1 + sep + cls - def get_sequence_ids(self, token_ids_0, token_ids_1=None, special_tokens_present=False): + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.