Add token type ids to CodeGenTokenizer (#29265)

* Add create token type ids to CodeGenTokenizer * Fix inconsistent length of token type ids * Format source codes * Fix inconsistent order of methods * Update docstring * add test_tokenizer_integration test * Format source codes * Add `copied from` comment to CodeGenTokenizerFast * Add doc of create_token_type_ids_from_sequences * Make return_token_type_ids False by default * Make test_tokenizer_integration as slow test * Add return_token_type_ids to tokenizer init arg * Add test for tokenizer's init return_token_type_ids * Format source codes
2024-04-17 19:19:18 +09:00
parent 812a5de229
commit 8d6b509611
4 changed files with 127 additions and 0 deletions
--- a/docs/source/en/model_doc/codegen.md
+++ b/docs/source/en/model_doc/codegen.md
@@ -72,6 +72,7 @@ hello_world()
 ## CodeGenTokenizer
 [[autodoc]] CodeGenTokenizer
    - create_token_type_ids_from_sequences
    - save_vocabulary
 ## CodeGenTokenizerFast
--- a/src/transformers/models/codegen/tokenization_codegen.py
+++ b/src/transformers/models/codegen/tokenization_codegen.py
@@ -134,6 +134,8 @@ class CodeGenTokenizer(PreTrainedTokenizer):
            other word. (CodeGen tokenizer detect beginning of words by the preceding space).
        add_bos_token (`bool`, *optional*, defaults to `False`):
            Whether to add a beginning of sequence token at the start of sequences.
        return_token_type_ids (`bool`, *optional*, defaults to `False`):
            Whether to return token type IDs.
    """
    vocab_files_names = VOCAB_FILES_NAMES
@@ -150,6 +152,7 @@ class CodeGenTokenizer(PreTrainedTokenizer):
        pad_token=None,
        add_prefix_space=False,
        add_bos_token=False,
        return_token_type_ids=False,
        **kwargs,
    ):
        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
@@ -157,6 +160,9 @@ class CodeGenTokenizer(PreTrainedTokenizer):
        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
        self.add_bos_token = add_bos_token
        self.return_token_type_ids = return_token_type_ids
        if self.return_token_type_ids:
            self.model_input_names.append("token_type_ids")
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
@@ -181,6 +187,7 @@ class CodeGenTokenizer(PreTrainedTokenizer):
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            add_bos_token=add_bos_token,
            return_token_type_ids=return_token_type_ids,
            **kwargs,
        )
@@ -270,6 +277,35 @@ class CodeGenTokenizer(PreTrainedTokenizer):
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A sequence
        pair mask has the following format:
        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```
        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id] if self.sep_token_id is not None else []
        cls = [self.cls_token_id] if self.sep_token_id is not None else []
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
--- a/src/transformers/models/codegen/tokenization_codegen_fast.py
+++ b/src/transformers/models/codegen/tokenization_codegen_fast.py
@@ -91,6 +91,8 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (CodeGen tokenizer detect beginning of words by the preceding space).
        return_token_type_ids (`bool`, *optional*, defaults to `False`):
            Whether to return token type IDs.
    """
    vocab_files_names = VOCAB_FILES_NAMES
@@ -106,8 +108,13 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        return_token_type_ids=False,
        **kwargs,
    ):
        self.return_token_type_ids = return_token_type_ids
        if self.return_token_type_ids:
            self.model_input_names.append("token_type_ids")
        super().__init__(
            vocab_file,
            merges_file,
@@ -116,6 +123,7 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            return_token_type_ids=return_token_type_ids,
            **kwargs,
        )
@@ -157,6 +165,36 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
        return super()._encode_plus(*args, **kwargs)
    # Copied from transformers.models.codegen.tokenization_codegen.CodeGenTokenizer.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A sequence
        pair mask has the following format:
        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```
        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id] if self.sep_token_id is not None else []
        cls = [self.cls_token_id] if self.sep_token_id is not None else []
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
--- a/tests/models/codegen/test_tokenization_codegen.py
+++ b/tests/models/codegen/test_tokenization_codegen.py
@@ -264,3 +264,55 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    # tokenizer has no padding token
    def test_padding_different_model_input_name(self):
        pass
    @slow
    def test_tokenizer_integration(self):
        # Custom test since this tokenizer takes return_token_type_ids as an init argument for backward compatibility.
        sequences = [
            "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
            "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
            "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained "
            "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
            "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
            "conditioning on both left and right context in all layers.",
            "The quick brown fox jumps over the lazy dog.",
        ]
        tokenizer_classes = [self.tokenizer_class]
        if self.test_rust_tokenizer:
            tokenizer_classes.append(self.rust_tokenizer_class)
        # Test default case. i.e. return_token_type_ids is False.
        for tokenizer_class in tokenizer_classes:
            tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono")
            encoding = tokenizer(sequences)
            decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
            # fmt: off
            expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
            # fmt: on
            encoding_data = encoding.data
            self.assertDictEqual(encoding_data, expected_encoding)
            for expected, decoded in zip(sequences, decoded_sequences):
                self.assertEqual(expected, decoded)
        # Test return_token_type_ids is True case.
        for tokenizer_class in tokenizer_classes:
            tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono", return_token_type_ids=True)
            encoding = tokenizer(sequences)
            decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
            # fmt: off
            expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
            # fmt: on
            encoding_data = encoding.data
            self.assertDictEqual(encoding_data, expected_encoding)
            for expected, decoded in zip(sequences, decoded_sequences):
                self.assertEqual(expected, decoded)