diff --git a/docs/source/en/model_doc/codegen.md b/docs/source/en/model_doc/codegen.md index 78be813db1..bee8c8a076 100644 --- a/docs/source/en/model_doc/codegen.md +++ b/docs/source/en/model_doc/codegen.md @@ -72,6 +72,7 @@ hello_world() ## CodeGenTokenizer [[autodoc]] CodeGenTokenizer + - create_token_type_ids_from_sequences - save_vocabulary ## CodeGenTokenizerFast diff --git a/src/transformers/models/codegen/tokenization_codegen.py b/src/transformers/models/codegen/tokenization_codegen.py index abf64e1892..1b03af7008 100644 --- a/src/transformers/models/codegen/tokenization_codegen.py +++ b/src/transformers/models/codegen/tokenization_codegen.py @@ -134,6 +134,8 @@ class CodeGenTokenizer(PreTrainedTokenizer): other word. (CodeGen tokenizer detect beginning of words by the preceding space). add_bos_token (`bool`, *optional*, defaults to `False`): Whether to add a beginning of sequence token at the start of sequences. + return_token_type_ids (`bool`, *optional*, defaults to `False`): + Whether to return token type IDs. """ vocab_files_names = VOCAB_FILES_NAMES @@ -150,6 +152,7 @@ class CodeGenTokenizer(PreTrainedTokenizer): pad_token=None, add_prefix_space=False, add_bos_token=False, + return_token_type_ids=False, **kwargs, ): bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token @@ -157,6 +160,9 @@ class CodeGenTokenizer(PreTrainedTokenizer): unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token self.add_bos_token = add_bos_token + self.return_token_type_ids = return_token_type_ids + if self.return_token_type_ids: + self.model_input_names.append("token_type_ids") with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) @@ -181,6 +187,7 @@ class CodeGenTokenizer(PreTrainedTokenizer): pad_token=pad_token, add_prefix_space=add_prefix_space, add_bos_token=add_bos_token, + return_token_type_ids=return_token_type_ids, **kwargs, ) @@ -270,6 +277,35 @@ class CodeGenTokenizer(PreTrainedTokenizer): text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) return text + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A sequence + pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] if self.sep_token_id is not None else [] + cls = [self.cls_token_id] if self.sep_token_id is not None else [] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") diff --git a/src/transformers/models/codegen/tokenization_codegen_fast.py b/src/transformers/models/codegen/tokenization_codegen_fast.py index fb9f0442e0..b086fb84a6 100644 --- a/src/transformers/models/codegen/tokenization_codegen_fast.py +++ b/src/transformers/models/codegen/tokenization_codegen_fast.py @@ -91,6 +91,8 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast): add_prefix_space (`bool`, *optional*, defaults to `False`): Whether or not to add an initial space to the input. This allows to treat the leading word just as any other word. (CodeGen tokenizer detect beginning of words by the preceding space). + return_token_type_ids (`bool`, *optional*, defaults to `False`): + Whether to return token type IDs. """ vocab_files_names = VOCAB_FILES_NAMES @@ -106,8 +108,13 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast): bos_token="<|endoftext|>", eos_token="<|endoftext|>", add_prefix_space=False, + return_token_type_ids=False, **kwargs, ): + self.return_token_type_ids = return_token_type_ids + if self.return_token_type_ids: + self.model_input_names.append("token_type_ids") + super().__init__( vocab_file, merges_file, @@ -116,6 +123,7 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast): bos_token=bos_token, eos_token=eos_token, add_prefix_space=add_prefix_space, + return_token_type_ids=return_token_type_ids, **kwargs, ) @@ -157,6 +165,36 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast): return super()._encode_plus(*args, **kwargs) + # Copied from transformers.models.codegen.tokenization_codegen.CodeGenTokenizer.create_token_type_ids_from_sequences + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A sequence + pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] if self.sep_token_id is not None else [] + cls = [self.cls_token_id] if self.sep_token_id is not None else [] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py index 025ed99b9a..e7945089c0 100644 --- a/tests/models/codegen/test_tokenization_codegen.py +++ b/tests/models/codegen/test_tokenization_codegen.py @@ -264,3 +264,55 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # tokenizer has no padding token def test_padding_different_model_input_name(self): pass + + @slow + def test_tokenizer_integration(self): + # Custom test since this tokenizer takes return_token_type_ids as an init argument for backward compatibility. + + sequences = [ + "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides " + "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural " + "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained " + "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.", + "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly " + "conditioning on both left and right context in all layers.", + "The quick brown fox jumps over the lazy dog.", + ] + + tokenizer_classes = [self.tokenizer_class] + if self.test_rust_tokenizer: + tokenizer_classes.append(self.rust_tokenizer_class) + + # Test default case. i.e. return_token_type_ids is False. + for tokenizer_class in tokenizer_classes: + tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono") + + encoding = tokenizer(sequences) + decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]] + + # fmt: off + expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501 + # fmt: on + + encoding_data = encoding.data + self.assertDictEqual(encoding_data, expected_encoding) + + for expected, decoded in zip(sequences, decoded_sequences): + self.assertEqual(expected, decoded) + + # Test return_token_type_ids is True case. + for tokenizer_class in tokenizer_classes: + tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono", return_token_type_ids=True) + + encoding = tokenizer(sequences) + decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]] + + # fmt: off + expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501 + # fmt: on + + encoding_data = encoding.data + self.assertDictEqual(encoding_data, expected_encoding) + + for expected, decoded in zip(sequences, decoded_sequences): + self.assertEqual(expected, decoded)