Terminator strings for generate() (#28932)
* stash commit (will discard all of this) * stash commit * First commit - needs a lot of testing! * Add a test * Fix imports and make the tests actually test something * Tests pass! * Rearrange test * Add comments (but it's still a bit confusing) * Stop storing the tokenizer * Comment fixup * Fix for input_ids with a single sequence * Update tests to test single sequences * make fixup * Fix incorrect use of isin() * Expand tests to catch more cases * Expand tests to catch more cases * make fixup * Fix length calculation and update tests * Handle Ġ as a space replacement too * Update src/transformers/generation/stopping_criteria.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Add optimizations from Joao's suggestion * Remove TODO * Update src/transformers/generation/stopping_criteria.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update tests/generation/test_stopping_criteria.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * make fixup * Rename some variables and remove some debugging clauses for clarity * Add tests for the sub-methods * Clarify one test slightly * Add stop_strings to GenerationConfig * generate() supports stop_string arg, asks for tokenizer if not provided * make fixup * Cleanup code and rename variables for clarity * Update tokenizer error * Update tokenizer passing, handle generation on GPU * Slightly more explanation cleanup * More comment cleanup * Factor out the token cleanup so it's more obvious what we're doing, and we can change it later * Careful with that cleanup! * Cleanup + optimizations to _get_matching_positions * More minor performance tweaks * Implement caching and eliminate some expensive ops (startup time: 200ms -> 9ms) * Remove the pin_memory call * Parallelize across all stop strings! 
* Quick fix for tensor devices * Update embeddings test for the new format * Fix test imports * Manual patching for BERT-like tokenizers * Return a bool vector instead of a single True/False * Better comment * Better comment * Add tests from @zucchini-nlp * Amy's list creation nit * tok_list -> token_list * Push a big expanded docstring (should we put it somewhere else?) * Expand docstrings * Docstring fixups * Rebase * make fixup * Make a properly general method for figuring out token strings * Fix naming throughout the functions * Move cache, refactor, fix tests * Add comment * Remove finished TODO * Remove finished TODO * make fixup * Update src/transformers/generation/stopping_criteria.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update and shorten docstring * Update tests to be shorter/clearer and test specific cases --------- Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
@@ -16,7 +16,7 @@
|
||||
import time
|
||||
import unittest
|
||||
|
||||
from transformers import is_torch_available
|
||||
from transformers import AutoTokenizer, is_torch_available
|
||||
from transformers.testing_utils import require_torch, torch_device
|
||||
|
||||
from ..test_modeling_common import ids_tensor
|
||||
@@ -31,6 +31,7 @@ if is_torch_available():
|
||||
MaxNewTokensCriteria,
|
||||
MaxTimeCriteria,
|
||||
StoppingCriteriaList,
|
||||
StopStringCriteria,
|
||||
validate_stopping_criteria,
|
||||
)
|
||||
|
||||
@@ -124,3 +125,134 @@ class StoppingCriteriaTestCase(unittest.TestCase):
|
||||
stopping_criteria = validate_stopping_criteria(StoppingCriteriaList(), 11)
|
||||
|
||||
self.assertEqual(len(stopping_criteria), 1)
|
||||
|
||||
def test_stop_string_criteria(self):
    """Check `StopStringCriteria` on strings that should and should not trigger a stop.

    The same positive and negative cases are run with two tokenizers: one that has no
    dedicated tokens for the stop strings, and one where they are actual special tokens.
    Every case runs in its own `subTest`, so a failure reports the offending string.
    """
    true_strings = [
        "<|im_start|><|im_end|>",
        "<|im_start|><|im_end|<|im_end|>",
        ">><|im_start|>>stop",
        "stop",
        "e nd",
    ]
    false_strings = [
        "<|im_start|><|im_end|",
        "<|im_start|><|im_end|<|im_end|",
        "<|im_end|><|im_start|>",
        "<|im_end|<>stop<|im_end|",
        "end",
        "en d",
        "eNd",
        "<|im_end|",
        "|im_end|>",
        "s",
    ]
    stop_strings = ["<|im_end|>", "stop", "e nd"]
    scores = None  # no scores are supplied to the criteria in this test

    def check_with(tokenizer):
        # Tokenize each group as one left-padded batch, then feed the rows one at a time
        true_ids = tokenizer(true_strings, return_tensors="pt", padding="longest", add_special_tokens=False)[
            "input_ids"
        ]
        false_ids = tokenizer(false_strings, return_tensors="pt", padding="longest", add_special_tokens=False)[
            "input_ids"
        ]
        criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=stop_strings)
        for row, text in enumerate(true_strings):
            with self.subTest(text=text, should_stop=True):
                self.assertTrue(criteria(true_ids[row : row + 1], scores))
        for row, text in enumerate(false_strings):
            with self.subTest(text=text, should_stop=False):
                self.assertFalse(criteria(false_ids[row : row + 1], scores))

    # Use a tokenizer that won't actually have special tokens for these
    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"
    check_with(tokenizer)

    # Now try it with a tokenizer where those are actually special tokens
    tokenizer = AutoTokenizer.from_pretrained("cognitivecomputations/dolphin-2.5-mixtral-8x7b")
    tokenizer.padding_side = "left"
    check_with(tokenizer)
|
||||
|
||||
def test_stop_string_matching_positions(self):
    """Check the valid positions and end overlaps reported per token for the stop string "stop"."""
    stop_string = "stop"
    token_list = ["last", "top", "topper", "s", "p"]
    token_indices = list(range(len(token_list)))

    positions_by_string, overlaps_by_string = StopStringCriteria._stop_string_get_matching_positions(
        token_list=token_list, token_indices=token_indices, stop_strings=[stop_string]
    )

    # Results are keyed by stop string and then by token index; re-key them by token text
    valid_positions = {}
    for token_idx, positions in positions_by_string[stop_string].items():
        valid_positions[token_list[token_idx]] = positions
    end_overlaps = {}
    for token_idx, overlaps in overlaps_by_string[stop_string].items():
        end_overlaps[token_list[token_idx]] = overlaps

    self.assertEqual(valid_positions, {"s": [3], "last": [2]})
    self.assertEqual(end_overlaps, {"top": [3], "topper": [3], "p": [1]})
|
||||
|
||||
def test_stop_string_embedding_vecs(self):
    """Check the columns of the embedding vector built for the stop string "stop"."""
    stop_string = "stop"
    token_list = ["last", "top", "topper", "s", "p"]
    token_indices = list(range(len(token_list)))

    result = StopStringCriteria._stop_string_create_embedding_vec(
        token_list=token_list, token_indices=token_indices, stop_strings=[stop_string]
    )
    # Three values are returned; only the embedding vector is inspected here
    embedding_vec, max_valid_positions, max_valid_end_lens = result

    # Column 0: positions inside the stop string where the token matches (excluding end overlaps)
    self.assertEqual(embedding_vec[:, 0].tolist(), [2, -1, -1, 3, -1])

    # Column 1: overlap lengths between end of stop string and start of token
    self.assertEqual(embedding_vec[:, 1].tolist(), [-1, 3, 3, -1, 1])

    # Column 2: length of each token
    expected_lengths = list(map(len, token_list))
    self.assertEqual(embedding_vec[:, 2].tolist(), expected_lengths)
|
||||
|
||||
def test_criterias_per_row(self):
    """Run a max-length + stop-string criteria list on a single sequence ending in the stop string."""
    stop_strings = ["end"]
    text = "They completed the challenging puzzle, revealing the hidden image at the end"

    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    tokenizer.pad_token_id = tokenizer.eos_token_id
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
    input_ids = inputs["input_ids"]

    scores = None
    length_criteria = MaxLengthCriteria(max_length=20)
    stop_string_criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=stop_strings)
    criteria = StoppingCriteriaList([length_criteria, stop_string_criteria])

    # trigger stopping when at least one criteria is satisfied, one value per batch
    self.assertTrue(criteria(input_ids, scores))

    # return False when neither is satisfied (final token removed)
    self.assertFalse(criteria(input_ids[:, :-1], scores))
|
||||
|
||||
def test_criterias_per_row_batched(self):
    """Run a max-length + stop-string criteria list on a left-padded batch and check each row's result."""
    stop_strings = ["end"]
    text = [
        "They completed the challenging puzzle, revealing the hidden image at the end",
        "Today a dragon flew over France",
        "The aroma of freshly baked pizza filled the kitchen",
    ]

    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"
    inputs = tokenizer(text, return_tensors="pt", padding="longest", add_special_tokens=False)
    input_ids = inputs["input_ids"]

    scores = None
    length_criteria = MaxLengthCriteria(max_length=20)
    stop_string_criteria = StopStringCriteria(tokenizer=tokenizer, stop_strings=stop_strings)
    criteria = StoppingCriteriaList([length_criteria, stop_string_criteria])

    # trigger stopping when at least one criteria is satisfied
    stop_flags = criteria(input_ids, scores).tolist()
    self.assertListEqual(stop_flags, [True, False, False])

    # False when neither is satisfied (final token removed from every row)
    truncated_flags = criteria(input_ids[:, :-1], scores).tolist()
    self.assertListEqual(truncated_flags, [False, False, False])
|
||||
|
||||
@@ -2330,6 +2330,43 @@ class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMi
|
||||
|
||||
self.assertListEqual(outputs, ["Wie alt sind Sie?"])
|
||||
|
||||
@slow
def test_per_row_stopping_criteria(self):
    """Compare `generate()` output with and without `stop_strings` on a left-padded batch of prompts."""
    stop_strings = ["secrets"]
    text = [
        "They completed the challenging puzzle, revealing the hidden",
        "Today a dragon flew over France",
        "The aroma of freshly baked pizza filled the kitchen",
    ]

    model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2").to(torch_device)
    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    tokenizer.padding_side = "left"
    tokenizer.pad_token_id = tokenizer.eos_token_id
    encoded = tokenizer(text, return_tensors="pt", padding="longest", add_special_tokens=False)
    input_ids = encoded.input_ids.to(torch_device)

    # The second and third rows are expected to decode identically in both runs
    unaffected_rows = [
        "<|endoftext|><|endoftext|><|endoftext|>Today a dragon flew over France and the French government was forced",
        "The aroma of freshly baked pizza filled the kitchen with a sense of freshness",
    ]

    # normal generation with one stopping criteria
    out = model.generate(input_ids, max_length=15)
    out_text = tokenizer.batch_decode(out)
    expected_out = [
        "They completed the challenging puzzle, revealing the hidden secrets of the world.\n",
        *unaffected_rows,
    ]
    self.assertListEqual(out_text, expected_out)

    # generation should stop at "secrets" for first batch only, filling the rest with eos tokens
    out = model.generate(input_ids, max_length=15, stop_strings=stop_strings, tokenizer=tokenizer)
    out_text = tokenizer.batch_decode(out)
    expected_out = [
        "They completed the challenging puzzle, revealing the hidden secrets<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>",
        *unaffected_rows,
    ]
    self.assertListEqual(out_text, expected_out)
|
||||
|
||||
def test_constrained_beam_search_mixin_type_checks(self):
|
||||
# PT-only test: TF doesn't have constrained beam search
|
||||
tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/t5-tiny-random")
|
||||
|
||||
Reference in New Issue
Block a user