From 7d709e55ed54961ce3c84f53f1c14ee4f0c8a2e3 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 22 Oct 2019 14:12:33 -0400 Subject: [PATCH] Remove --- examples/benchmarks.py | 4 +-- .../distillation/scripts/binarized_data.py | 2 +- examples/run_generation.py | 2 +- transformers/tests/tokenization_bert_test.py | 4 +-- .../tests/tokenization_distilbert_test.py | 4 +-- .../tests/tokenization_roberta_test.py | 8 +++--- .../tests/tokenization_tests_commons.py | 28 ++++++++++--------- transformers/tests/tokenization_xlm_test.py | 4 +-- transformers/tests/tokenization_xlnet_test.py | 4 +-- transformers/tokenization_utils.py | 20 ++++++------- 10 files changed, 41 insertions(+), 39 deletions(-) diff --git a/examples/benchmarks.py b/examples/benchmarks.py index d03844697d..06f368d946 100644 --- a/examples/benchmarks.py +++ b/examples/benchmarks.py @@ -309,7 +309,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript) model = AutoModel.from_pretrained(model_name, config=config) tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenized_sequence = tokenizer.encode(input_text) + tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False) max_input_size = tokenizer.max_model_input_sizes[model_name] batch_sizes = [1, 2, 4, 8] @@ -353,7 +353,7 @@ def _compute_tensorflow(model_names, dictionary, average_over): model = TFAutoModel.from_pretrained(model_name, config=config) tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenized_sequence = tokenizer.encode(input_text) + tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False) max_input_size = tokenizer.max_model_input_sizes[model_name] batch_sizes = [1, 2, 4, 8] diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py index 43824e9964..681cc2de34 100644 --- a/examples/distillation/scripts/binarized_data.py +++ b/examples/distillation/scripts/binarized_data.py @@ -68,7 +68,7 @@ def main(): start = time.time() for text in data: text = f'{bos} {text.strip()} {sep}' - token_ids = tokenizer.encode(text) + token_ids = tokenizer.encode(text, add_special_tokens=False) rslt.append(token_ids) iter += 1 diff --git a/examples/run_generation.py b/examples/run_generation.py index ef58cfd844..b7907e40da 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -223,7 +223,7 @@ def main(): if args.model_type in ["transfo-xl", "xlnet"]: # Models with memory likes to have a long prompt for short inputs. raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text - context_tokens = tokenizer.encode(raw_text) + context_tokens = tokenizer.encode(raw_text, add_special_tokens=False) out = sample_sequence( model=model, context=context_tokens, diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py index 5e49e2915b..fd61ec30ba 100644 --- a/transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -128,8 +128,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): def test_sequence_builders(self): tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") - text = tokenizer.encode("sequence builders") - text_2 = tokenizer.encode("multi-sequence build") + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py index a18d644fe8..e3c8376ca8 100644 --- a/transformers/tests/tokenization_distilbert_test.py +++ b/transformers/tests/tokenization_distilbert_test.py @@ -33,8 +33,8 @@ class DistilBertTokenizationTest(BertTokenizationTest): def test_sequence_builders(self): tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") - text = tokenizer.encode("sequence builders") - text_2 = tokenizer.encode("multi-sequence build") + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) diff --git a/transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py index a731ac26c9..b31dd94f21 100644 --- a/transformers/tests/tokenization_roberta_test.py +++ b/transformers/tests/tokenization_roberta_test.py @@ -70,19 +70,19 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer = self.get_tokenizer() self.assertListEqual( - tokenizer.encode('Hello world!'), + tokenizer.encode('Hello world!', add_special_tokens=False), [0, 31414, 232, 328, 2] ) self.assertListEqual( - tokenizer.encode('Hello world! cécé herlolip 418'), + tokenizer.encode('Hello world! cécé herlolip 418', add_special_tokens=False), [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] ) def test_sequence_builders(self): tokenizer = RobertaTokenizer.from_pretrained("roberta-base") - text = tokenizer.encode("sequence builders") - text_2 = tokenizer.encode("multi-sequence build") + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index b2801d5f41..a921696b77 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -79,13 +79,13 @@ class CommonTestCases: # Now let's start the test tokenizer = self.get_tokenizer(max_len=42) - before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running") + before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False) with TemporaryDirectory() as tmpdirname: tokenizer.save_pretrained(tmpdirname) tokenizer = self.tokenizer_class.from_pretrained(tmpdirname) - after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running") + after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False) self.assertListEqual(before_tokens, after_tokens) self.assertEqual(tokenizer.max_len, 42) @@ -130,7 +130,7 @@ class CommonTestCases: self.assertEqual(added_toks, len(new_toks)) self.assertEqual(all_size_2, all_size + len(new_toks)) - tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l") + tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) out_string = tokenizer.decode(tokens) self.assertGreaterEqual(len(tokens), 4) @@ -148,7 +148,8 @@ class CommonTestCases: self.assertEqual(added_toks_2, len(new_toks_2)) self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l") + tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", + add_special_tokens=False) out_string = tokenizer.decode(tokens) self.assertGreaterEqual(len(tokens), 6) @@ -166,7 +167,7 @@ class CommonTestCases: tokens = tokenizer.tokenize(input_text) ids = tokenizer.convert_tokens_to_ids(tokens) - ids_2 = tokenizer.encode(input_text) + ids_2 = tokenizer.encode(input_text, add_special_tokens=False) self.assertListEqual(ids, ids_2) tokens_2 = tokenizer.convert_ids_to_tokens(ids) @@ -206,7 +207,7 @@ class CommonTestCases: seq_0 = "Test this method." seq_1 = "With these inputs." - sequences = tokenizer.encode(seq_0, seq_1) + sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False) attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) # Method is implemented (e.g. not GPT-2) @@ -219,7 +220,7 @@ class CommonTestCases: seq_0 = "This is a sentence to be encoded." stride = 2 - sequence = tokenizer.encode(seq_0) + sequence = tokenizer.encode(seq_0, add_special_tokens=False) num_added_tokens = tokenizer.num_added_tokens() total_length = len(sequence) + num_added_tokens information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride) @@ -239,13 +240,13 @@ class CommonTestCases: seq_1 = "This is another sentence to be encoded." stride = 2 - sequence_0_no_special_tokens = tokenizer.encode(seq_0) - sequence_1_no_special_tokens = tokenizer.encode(seq_1) + sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False) + sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False) sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) truncated_second_sequence = tokenizer.build_inputs_with_special_tokens( - tokenizer.encode(seq_0), - tokenizer.encode(seq_1)[:-2] + tokenizer.encode(seq_0, add_special_tokens=False), + tokenizer.encode(seq_1, add_special_tokens=False)[:-2] ) information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True, @@ -283,7 +284,7 @@ class CommonTestCases: sequence_1 = "This one too please." # Testing single inputs - encoded_sequence = tokenizer.encode(sequence_0) + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] @@ -294,7 +295,8 @@ class CommonTestCases: self.assertEqual(encoded_sequence, filtered_sequence) # Testing inputs pairs - encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1) + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(sequence_1, + add_special_tokens=False) encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] diff --git a/transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py index 0949b0cce4..567edf1ccd 100644 --- a/transformers/tests/tokenization_xlm_test.py +++ b/transformers/tests/tokenization_xlm_test.py @@ -69,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): def test_sequence_builders(self): tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048") - text = tokenizer.encode("sequence builders") - text_2 = tokenizer.encode("multi-sequence build") + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) diff --git a/transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py index 1a5dbcf6df..653968b9af 100644 --- a/transformers/tests/tokenization_xlnet_test.py +++ b/transformers/tests/tokenization_xlnet_test.py @@ -92,8 +92,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): def test_sequence_builders(self): tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") - text = tokenizer.encode("sequence builders") - text_2 = tokenizer.encode("multi-sequence build") + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 5e5be872ef..ac765165e2 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -689,14 +689,14 @@ class PreTrainedTokenizer(object): raise NotImplementedError def encode(self, - text, - text_pair=None, - add_special_tokens=False, - max_length=None, - stride=0, - truncation_strategy='longest_first', - return_tensors=None, - **kwargs): + text, + text_pair=None, + add_special_tokens=True, + max_length=None, + stride=0, + truncation_strategy='longest_first', + return_tensors=None, + **kwargs): """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. @@ -739,7 +739,7 @@ class PreTrainedTokenizer(object): def encode_plus(self, text, text_pair=None, - add_special_tokens=False, + add_special_tokens=True, max_length=None, stride=0, truncation_strategy='longest_first', @@ -794,7 +794,7 @@ class PreTrainedTokenizer(object): truncation_strategy=truncation_strategy, return_tensors=return_tensors) - def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, + def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0, truncation_strategy='longest_first', return_tensors=None): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.