From 5b5ee235f3239413e9614bd02032b1a203dab710 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 29 Aug 2023 15:08:14 +0200 Subject: [PATCH] [`LlamaTokenizer`] `tokenize` nits. (#25793) * return when length is zero * Add tests Co-authored-by: Avnish Narayan <38871737avnishn@users.noreply.github.com> * Co-authored-by: avnishn <38871737+avnishn@users.noreply.github.com> * codeLlama doc should not be on Main * update test --------- Co-authored-by: Avnish Narayan <38871737avnishn@users.noreply.github.com> --- README.md | 2 +- .../models/llama/tokenization_llama.py | 9 ++--- src/transformers/models/t5/tokenization_t5.py | 10 ++--- tests/models/llama/test_tokenization_llama.py | 31 ++++++++++++++++ tests/models/t5/test_tokenization_t5.py | 37 +++++++++++++++++++ 5 files changed, 78 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 4170e489ae..543a13108b 100644 --- a/README.md +++ b/README.md @@ -318,7 +318,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. -1. **[CodeLlama](https://huggingface.co/docs/transformers/main/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. +1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index bd345cca2e..9ef5d0c717 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -200,18 +200,17 @@ class LlamaTokenizer(PreTrainedTokenizer): return vocab # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize - def tokenize(self, text: "TextInput", **kwargs) -> List[str]: + def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]: """ Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the first token is special. """ - if self.legacy: + if self.legacy or len(text) == 0: return super().tokenize(text, **kwargs) - if len(text) > 0: - tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs) + tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs) - if tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: + if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: tokens = tokens[1:] return tokens diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index e01ff0103c..4d3a3a3d8d 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -351,18 +351,18 @@ class T5Tokenizer(PreTrainedTokenizer): self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def tokenize(self, text: "TextInput", **kwargs) -> List[str]: + # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize + def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]: """ Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the first token is special. """ - if self.legacy: + if self.legacy or len(text) == 0: return super().tokenize(text, **kwargs) - if len(text) > 0: - tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs) + tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs) - if tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: + if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: tokens = tokens[1:] return tokens diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index 25b4f4d8f1..9223e626f9 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -555,6 +555,25 @@ class LlamaIntegrationTest(unittest.TestCase): self.assertNotEqual(sp_tokens, tokens) self.assertEqual(tokens, ["", ">"]) + tokens = tokenizer.tokenize("") + self.assertEqual(tokens, []) + self.assertEqual(tokens, tokenizer.sp_model.encode("", out_type=str)) + + tokens = tokenizer.tokenize(" ") + self.assertEqual(tokens, ["▁▁"]) + # a dummy prefix space is not added by the sp_model as it was de-activated + self.assertEqual(tokens, tokenizer.sp_model.encode(" ", out_type=str)) + + tokens = tokenizer.tokenize("▁") + self.assertEqual(tokens, ["▁▁"]) + # a dummy prefix space is not added by the sp_model as it was de-activated + self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str)) + + tokens = tokenizer.tokenize(" ▁") + self.assertEqual(tokens, ["▁▁▁"]) + # a dummy prefix space is not added by the sp_model as it was de-activated + self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁▁", out_type=str)) + @require_sentencepiece @require_tokenizers @@ -583,6 +602,18 @@ class CommonSpmIntegrationTests(unittest.TestCase): tokens = self.tokenizer.tokenize(". Hello") self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"]) + tokens = self.tokenizer.tokenize("") + self.assertEqual(tokens, []) + self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str)) + + tokens = self.tokenizer.tokenize(" ") + self.assertEqual(tokens, []) + self.assertEqual(tokens, self.tokenizer.sp_model.encode(" ", out_type=str)) + + tokens = self.tokenizer.tokenize("▁") + self.assertEqual(tokens, []) + self.assertEqual(tokens, self.tokenizer.sp_model.encode("▁", out_type=str)) + def test_remove_extra_whitespaces(self): # make sure the extra spaces are eaten. Since the sample vocab does not have # `______`. sentencepiece.NormalizerSpec.remove_extra_whitespaces attribute is set to False diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index d559044205..efbe37d75e 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -400,6 +400,31 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = T5TokenizerFast(SAMPLE_VOCAB, extra_ids=10) self.assertListEqual(sorted(tokenizer.get_sentinel_token_ids()), sorted(range(1000, 1010))) + def test_some_edge_cases(self): + tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False) + + sp_tokens = tokenizer.sp_model.encode(">", out_type=str) + self.assertEqual(sp_tokens, ["<", "/", "s", ">", ">"]) + tokens = tokenizer.tokenize(">") + self.assertNotEqual(sp_tokens, tokens) + self.assertEqual(tokens, ["", ">"]) + + tokens = tokenizer.tokenize("") + self.assertEqual(tokens, []) + self.assertEqual(tokens, tokenizer.sp_model.encode("", out_type=str)) + + tokens = tokenizer.tokenize(" ") + self.assertEqual(tokens, []) + self.assertEqual(tokens, tokenizer.sp_model.encode(" ", out_type=str)) + + tokens = tokenizer.tokenize("▁") + self.assertEqual(tokens, []) + self.assertEqual(tokens, tokenizer.sp_model.encode("▁", out_type=str)) + + tokens = tokenizer.tokenize(" ▁") + self.assertEqual(tokens, []) + self.assertEqual(tokens, tokenizer.sp_model.encode("▁", out_type=str)) + @require_sentencepiece @require_tokenizers @@ -427,6 +452,18 @@ class CommonSpmIntegrationTests(unittest.TestCase): tokens = self.tokenizer.tokenize(". Hello") self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"]) + tokens = self.tokenizer.tokenize("") + self.assertEqual(tokens, []) + self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str)) + + tokens = self.tokenizer.tokenize(" ") + self.assertEqual(tokens, []) + self.assertEqual(tokens, self.tokenizer.sp_model.encode(" ", out_type=str)) + + tokens = self.tokenizer.tokenize("▁") + self.assertEqual(tokens, []) + self.assertEqual(tokens, self.tokenizer.sp_model.encode("▁", out_type=str)) + def test_remove_extra_whitespaces(self): # make sure the extra spaces are eaten # sentencepiece.NormalizerSpec.remove_extra_whitespaces attribute