Update-llama-code (#25826)

* some bug fixes

* updates

* Update code_llama.md

Co-authored-by: Omar Sanseviero <osanseviero@users.noreply.github.com>

* Add co-author

Co-authored-by: pcuenca <pedro@latenitesoft.com>

* add a test

* fixup

* nits

* some updates

* fix-copies

* address comments

* nits

* nits

* fix docstring

* Apply suggestions from code review

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* update

* add int for https://huggingface.co/spaces/hf-accelerate/model-memory-usage

---------

Co-authored-by: Omar Sanseviero <osanseviero@users.noreply.github.com>
Co-authored-by: pcuenca <pedro@latenitesoft.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Arthur
2023-09-01 20:40:40 +02:00
committed by GitHub
parent 3587769c08
commit a4dd53d88e
4 changed files with 88 additions and 46 deletions

View File

@@ -65,6 +65,11 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname)
def test_no_infilling_init(self):
tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, prefix_token=None, keep_accents=True)
with self.assertRaises(ValueError):
tokenizer.tokenize("This is <FILL_ME> prefix")
def test_full_tokenizer(self):
tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
@@ -587,8 +592,8 @@ split,
end
""",
]
tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-hf")
tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
formatted_prompt = tokenizer.tokenize(PROMPTS[0])
self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0]))