Remove
This commit is contained in:
@@ -309,7 +309,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
|
|||||||
model = AutoModel.from_pretrained(model_name, config=config)
|
model = AutoModel.from_pretrained(model_name, config=config)
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
|
||||||
tokenized_sequence = tokenizer.encode(input_text)
|
tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
|
||||||
|
|
||||||
max_input_size = tokenizer.max_model_input_sizes[model_name]
|
max_input_size = tokenizer.max_model_input_sizes[model_name]
|
||||||
batch_sizes = [1, 2, 4, 8]
|
batch_sizes = [1, 2, 4, 8]
|
||||||
@@ -353,7 +353,7 @@ def _compute_tensorflow(model_names, dictionary, average_over):
|
|||||||
model = TFAutoModel.from_pretrained(model_name, config=config)
|
model = TFAutoModel.from_pretrained(model_name, config=config)
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
|
||||||
tokenized_sequence = tokenizer.encode(input_text)
|
tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
|
||||||
|
|
||||||
max_input_size = tokenizer.max_model_input_sizes[model_name]
|
max_input_size = tokenizer.max_model_input_sizes[model_name]
|
||||||
batch_sizes = [1, 2, 4, 8]
|
batch_sizes = [1, 2, 4, 8]
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ def main():
|
|||||||
start = time.time()
|
start = time.time()
|
||||||
for text in data:
|
for text in data:
|
||||||
text = f'{bos} {text.strip()} {sep}'
|
text = f'{bos} {text.strip()} {sep}'
|
||||||
token_ids = tokenizer.encode(text)
|
token_ids = tokenizer.encode(text, add_special_tokens=False)
|
||||||
rslt.append(token_ids)
|
rslt.append(token_ids)
|
||||||
|
|
||||||
iter += 1
|
iter += 1
|
||||||
|
|||||||
@@ -223,7 +223,7 @@ def main():
|
|||||||
if args.model_type in ["transfo-xl", "xlnet"]:
|
if args.model_type in ["transfo-xl", "xlnet"]:
|
||||||
# Models with memory likes to have a long prompt for short inputs.
|
# Models with memory likes to have a long prompt for short inputs.
|
||||||
raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
|
raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
|
||||||
context_tokens = tokenizer.encode(raw_text)
|
context_tokens = tokenizer.encode(raw_text, add_special_tokens=False)
|
||||||
out = sample_sequence(
|
out = sample_sequence(
|
||||||
model=model,
|
model=model,
|
||||||
context=context_tokens,
|
context=context_tokens,
|
||||||
|
|||||||
@@ -128,8 +128,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
def test_sequence_builders(self):
|
def test_sequence_builders(self):
|
||||||
tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
|
tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
|
||||||
|
|
||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders", add_special_tokens=False)
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
|
||||||
|
|
||||||
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|||||||
@@ -33,8 +33,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
|
|||||||
def test_sequence_builders(self):
|
def test_sequence_builders(self):
|
||||||
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
||||||
|
|
||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders", add_special_tokens=False)
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
|
||||||
|
|
||||||
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|||||||
@@ -70,19 +70,19 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.encode('Hello world!'),
|
tokenizer.encode('Hello world!', add_special_tokens=False),
|
||||||
[0, 31414, 232, 328, 2]
|
[0, 31414, 232, 328, 2]
|
||||||
)
|
)
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.encode('Hello world! cécé herlolip 418'),
|
tokenizer.encode('Hello world! cécé herlolip 418', add_special_tokens=False),
|
||||||
[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
|
[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_sequence_builders(self):
|
def test_sequence_builders(self):
|
||||||
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
|
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
|
||||||
|
|
||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders", add_special_tokens=False)
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
|
||||||
|
|
||||||
encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
|
encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
|
||||||
encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
|
encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
|
||||||
|
|||||||
@@ -79,13 +79,13 @@ class CommonTestCases:
|
|||||||
# Now let's start the test
|
# Now let's start the test
|
||||||
tokenizer = self.get_tokenizer(max_len=42)
|
tokenizer = self.get_tokenizer(max_len=42)
|
||||||
|
|
||||||
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
|
||||||
|
|
||||||
with TemporaryDirectory() as tmpdirname:
|
with TemporaryDirectory() as tmpdirname:
|
||||||
tokenizer.save_pretrained(tmpdirname)
|
tokenizer.save_pretrained(tmpdirname)
|
||||||
tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
|
tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
|
||||||
|
|
||||||
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
|
||||||
self.assertListEqual(before_tokens, after_tokens)
|
self.assertListEqual(before_tokens, after_tokens)
|
||||||
|
|
||||||
self.assertEqual(tokenizer.max_len, 42)
|
self.assertEqual(tokenizer.max_len, 42)
|
||||||
@@ -130,7 +130,7 @@ class CommonTestCases:
|
|||||||
self.assertEqual(added_toks, len(new_toks))
|
self.assertEqual(added_toks, len(new_toks))
|
||||||
self.assertEqual(all_size_2, all_size + len(new_toks))
|
self.assertEqual(all_size_2, all_size + len(new_toks))
|
||||||
|
|
||||||
tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l")
|
tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
|
||||||
out_string = tokenizer.decode(tokens)
|
out_string = tokenizer.decode(tokens)
|
||||||
|
|
||||||
self.assertGreaterEqual(len(tokens), 4)
|
self.assertGreaterEqual(len(tokens), 4)
|
||||||
@@ -148,7 +148,8 @@ class CommonTestCases:
|
|||||||
self.assertEqual(added_toks_2, len(new_toks_2))
|
self.assertEqual(added_toks_2, len(new_toks_2))
|
||||||
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
|
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
|
||||||
|
|
||||||
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
|
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
|
||||||
|
add_special_tokens=False)
|
||||||
out_string = tokenizer.decode(tokens)
|
out_string = tokenizer.decode(tokens)
|
||||||
|
|
||||||
self.assertGreaterEqual(len(tokens), 6)
|
self.assertGreaterEqual(len(tokens), 6)
|
||||||
@@ -166,7 +167,7 @@ class CommonTestCases:
|
|||||||
|
|
||||||
tokens = tokenizer.tokenize(input_text)
|
tokens = tokenizer.tokenize(input_text)
|
||||||
ids = tokenizer.convert_tokens_to_ids(tokens)
|
ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||||
ids_2 = tokenizer.encode(input_text)
|
ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
|
||||||
self.assertListEqual(ids, ids_2)
|
self.assertListEqual(ids, ids_2)
|
||||||
|
|
||||||
tokens_2 = tokenizer.convert_ids_to_tokens(ids)
|
tokens_2 = tokenizer.convert_ids_to_tokens(ids)
|
||||||
@@ -206,7 +207,7 @@ class CommonTestCases:
|
|||||||
seq_0 = "Test this method."
|
seq_0 = "Test this method."
|
||||||
seq_1 = "With these inputs."
|
seq_1 = "With these inputs."
|
||||||
|
|
||||||
sequences = tokenizer.encode(seq_0, seq_1)
|
sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
|
||||||
attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
|
attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
|
||||||
|
|
||||||
# Method is implemented (e.g. not GPT-2)
|
# Method is implemented (e.g. not GPT-2)
|
||||||
@@ -219,7 +220,7 @@ class CommonTestCases:
|
|||||||
seq_0 = "This is a sentence to be encoded."
|
seq_0 = "This is a sentence to be encoded."
|
||||||
stride = 2
|
stride = 2
|
||||||
|
|
||||||
sequence = tokenizer.encode(seq_0)
|
sequence = tokenizer.encode(seq_0, add_special_tokens=False)
|
||||||
num_added_tokens = tokenizer.num_added_tokens()
|
num_added_tokens = tokenizer.num_added_tokens()
|
||||||
total_length = len(sequence) + num_added_tokens
|
total_length = len(sequence) + num_added_tokens
|
||||||
information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)
|
information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)
|
||||||
@@ -239,13 +240,13 @@ class CommonTestCases:
|
|||||||
seq_1 = "This is another sentence to be encoded."
|
seq_1 = "This is another sentence to be encoded."
|
||||||
stride = 2
|
stride = 2
|
||||||
|
|
||||||
sequence_0_no_special_tokens = tokenizer.encode(seq_0)
|
sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
|
||||||
sequence_1_no_special_tokens = tokenizer.encode(seq_1)
|
sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
|
||||||
|
|
||||||
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
|
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
|
||||||
truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
|
truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
|
||||||
tokenizer.encode(seq_0),
|
tokenizer.encode(seq_0, add_special_tokens=False),
|
||||||
tokenizer.encode(seq_1)[:-2]
|
tokenizer.encode(seq_1, add_special_tokens=False)[:-2]
|
||||||
)
|
)
|
||||||
|
|
||||||
information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
|
information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
|
||||||
@@ -283,7 +284,7 @@ class CommonTestCases:
|
|||||||
sequence_1 = "This one too please."
|
sequence_1 = "This one too please."
|
||||||
|
|
||||||
# Testing single inputs
|
# Testing single inputs
|
||||||
encoded_sequence = tokenizer.encode(sequence_0)
|
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
|
||||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
||||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||||
@@ -294,7 +295,8 @@ class CommonTestCases:
|
|||||||
self.assertEqual(encoded_sequence, filtered_sequence)
|
self.assertEqual(encoded_sequence, filtered_sequence)
|
||||||
|
|
||||||
# Testing inputs pairs
|
# Testing inputs pairs
|
||||||
encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
|
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(sequence_1,
|
||||||
|
add_special_tokens=False)
|
||||||
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
|
||||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||||
|
|||||||
@@ -69,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
def test_sequence_builders(self):
|
def test_sequence_builders(self):
|
||||||
tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
|
tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
|
||||||
|
|
||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders", add_special_tokens=False)
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
|
||||||
|
|
||||||
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|||||||
@@ -92,8 +92,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
def test_sequence_builders(self):
|
def test_sequence_builders(self):
|
||||||
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
|
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
|
||||||
|
|
||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders", add_special_tokens=False)
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
|
||||||
|
|
||||||
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|||||||
@@ -689,14 +689,14 @@ class PreTrainedTokenizer(object):
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def encode(self,
|
def encode(self,
|
||||||
text,
|
text,
|
||||||
text_pair=None,
|
text_pair=None,
|
||||||
add_special_tokens=False,
|
add_special_tokens=True,
|
||||||
max_length=None,
|
max_length=None,
|
||||||
stride=0,
|
stride=0,
|
||||||
truncation_strategy='longest_first',
|
truncation_strategy='longest_first',
|
||||||
return_tensors=None,
|
return_tensors=None,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
"""
|
"""
|
||||||
Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
|
Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
|
||||||
|
|
||||||
@@ -739,7 +739,7 @@ class PreTrainedTokenizer(object):
|
|||||||
def encode_plus(self,
|
def encode_plus(self,
|
||||||
text,
|
text,
|
||||||
text_pair=None,
|
text_pair=None,
|
||||||
add_special_tokens=False,
|
add_special_tokens=True,
|
||||||
max_length=None,
|
max_length=None,
|
||||||
stride=0,
|
stride=0,
|
||||||
truncation_strategy='longest_first',
|
truncation_strategy='longest_first',
|
||||||
@@ -794,7 +794,7 @@ class PreTrainedTokenizer(object):
|
|||||||
truncation_strategy=truncation_strategy,
|
truncation_strategy=truncation_strategy,
|
||||||
return_tensors=return_tensors)
|
return_tensors=return_tensors)
|
||||||
|
|
||||||
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
|
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
|
||||||
truncation_strategy='longest_first', return_tensors=None):
|
truncation_strategy='longest_first', return_tensors=None):
|
||||||
"""
|
"""
|
||||||
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
|
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
|
||||||
|
|||||||
Reference in New Issue
Block a user