Use a different internal model in tests (#33387)

* Use a different internal model in tests

* Use a different internal model in tests
This commit is contained in:
Ita Zaporozhets
2024-09-11 05:27:00 -04:00
committed by GitHub
parent f38590dade
commit 781bbc4d98

View File

@@ -846,7 +846,8 @@ class TikTokenIntegrationTests(unittest.TestCase):
""" """
def test_tiktoken_llama(self): def test_tiktoken_llama(self):
model_path = "hf-internal-testing/Llama3-Instruct-Internal" model_path = "hf-internal-testing/llama-3-8b-internal"
subfolder = "original"
test_text = "This is a test sentence." test_text = "This is a test sentence."
test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001] test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001]
num_reserved_special_tokens = 256 num_reserved_special_tokens = 256
@@ -866,6 +867,7 @@ class TikTokenIntegrationTests(unittest.TestCase):
tiktoken_tokenizer = PreTrainedTokenizerFast.from_pretrained( tiktoken_tokenizer = PreTrainedTokenizerFast.from_pretrained(
model_path, model_path,
subfolder=subfolder,
additional_special_tokens=special_tokens, additional_special_tokens=special_tokens,
bos_token="<|begin_of_text|>", bos_token="<|begin_of_text|>",
eos_token="<|end_of_text|>", eos_token="<|end_of_text|>",
@@ -874,7 +876,14 @@ class TikTokenIntegrationTests(unittest.TestCase):
self.assertEqual(tokens[0], "<|begin_of_text|>") self.assertEqual(tokens[0], "<|begin_of_text|>")
tiktoken_tokenizer = AutoTokenizer.from_pretrained( tiktoken_tokenizer = AutoTokenizer.from_pretrained(
model_path, legacy=False, additional_special_tokens=special_tokens, add_bos_token=True, add_eos_token=True model_path,
subfolder=subfolder,
legacy=False,
additional_special_tokens=special_tokens,
bos_token="<|begin_of_text|>",
eos_token="<|end_of_text|>",
add_bos_token=True,
add_eos_token=True,
) )
self.assertTrue(isinstance(tiktoken_tokenizer, PreTrainedTokenizerFast)) self.assertTrue(isinstance(tiktoken_tokenizer, PreTrainedTokenizerFast))
@@ -892,7 +901,10 @@ class TikTokenIntegrationTests(unittest.TestCase):
tiktoken_tokenizer = AutoTokenizer.from_pretrained( tiktoken_tokenizer = AutoTokenizer.from_pretrained(
model_path, model_path,
subfolder=subfolder,
additional_special_tokens=special_tokens, additional_special_tokens=special_tokens,
bos_token="<|begin_of_text|>",
eos_token="<|end_of_text|>",
from_slow=True, from_slow=True,
add_bos_token=True, add_bos_token=True,
add_eos_token=True, add_eos_token=True,