diff --git a/templates/adding_a_missing_tokenization_test/README.md b/templates/adding_a_missing_tokenization_test/README.md new file mode 100644 index 0000000000..935f21c5ca --- /dev/null +++ b/templates/adding_a_missing_tokenization_test/README.md @@ -0,0 +1,39 @@ + + +This folder contains a template to add a tokenization test. + +## Usage + +Using the `cookiecutter` utility requires all the `dev` dependencies to be installed. + +Let's first [fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the `transformers` repo on GitHub. Once it's done, you can clone your fork and install `transformers` in your environment: + +```shell script +git clone https://github.com/YOUR-USERNAME/transformers +cd transformers +pip install -e ".[dev]" +``` + +Once the installation is done, you can generate the template by running the following command. Be careful, the template will be generated inside a new folder in your current working directory. + +```shell script +cookiecutter path-to-the-folder/adding_a_missing_tokenization_test/ +``` + +You will then have to answer some questions about the tokenizer for which you want to add tests. The `modelname` should be cased according to the plain text casing, e.g., BERT, RoBERTa, DeBERTa. + +Once the command has finished, you should have one new file inside the newly created folder named `test_tokenization_Xxx.py`. At this point the template is finished and you can move it to the sub-folder of the corresponding model in the test folder. 
diff --git a/templates/adding_a_missing_tokenization_test/cookiecutter-template-{{cookiecutter.modelname}}/test_tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_missing_tokenization_test/cookiecutter-template-{{cookiecutter.modelname}}/test_tokenization_{{cookiecutter.lowercase_modelname}}.py new file mode 100644 index 0000000000..631886f6b2 --- /dev/null +++ b/templates/adding_a_missing_tokenization_test/cookiecutter-template-{{cookiecutter.modelname}}/test_tokenization_{{cookiecutter.lowercase_modelname}}.py @@ -0,0 +1,78 @@ +# coding=utf-8 +# Copyright 2022 {{cookiecutter.authors}}. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the {{cookiecutter.modelname}} tokenizer. 
""" + + +import unittest + +{% if cookiecutter.has_slow_class == "True" and cookiecutter.has_fast_class == "True" -%} +from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, {{cookiecutter.camelcase_modelname}}TokenizerFast +{% elif cookiecutter.has_slow_class == "True" -%} +from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer +{% elif cookiecutter.has_fast_class == "True" -%} +from transformers import {{cookiecutter.camelcase_modelname}}TokenizerFast +{% endif -%} +{% if cookiecutter.has_fast_class == "True" and cookiecutter.slow_tokenizer_use_sentencepiece == "True" -%} +from transformers.testing_utils import require_sentencepiece, require_tokenizers +from ..test_tokenization_common import TokenizerTesterMixin + + +@require_sentencepiece +@require_tokenizers +{% elif cookiecutter.slow_tokenizer_use_sentencepiece == "True" -%} +from transformers.testing_utils import require_sentencepiece +from ..test_tokenization_common import TokenizerTesterMixin + + +@require_sentencepiece +{% elif cookiecutter.has_fast_class == "True" -%} +from transformers.testing_utils import require_tokenizers +from ..test_tokenization_common import TokenizerTesterMixin + + +@require_tokenizers +{% else -%} +from ..test_tokenization_common import TokenizerTesterMixin + + +{% endif -%} +class {{cookiecutter.camelcase_modelname}}TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + {% if cookiecutter.has_slow_class == "True" -%} + tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer + test_slow_tokenizer = True + {% else -%} + tokenizer_class = None + test_slow_tokenizer = False + {% endif -%} + {% if cookiecutter.has_fast_class == "True" -%} + rust_tokenizer_class = {{cookiecutter.camelcase_modelname}}TokenizerFast + test_rust_tokenizer = True + {% else -%} + rust_tokenizer_class = None + test_rust_tokenizer = False + {% endif -%} + {% if cookiecutter.slow_tokenizer_use_sentencepiece == "True" -%} + test_sentencepiece = True + {% endif -%} 
+ # TODO: Check in `TokenizerTesterMixin` if other attributes need to be changed + def setUp(self): + super().setUp() + + raise NotImplementedError( + "Here you have to implement the saving of a toy tokenizer in " + "`self.tmpdirname`." + ) + + # TODO: add tests with hard-coded target values \ No newline at end of file diff --git a/templates/adding_a_missing_tokenization_test/cookiecutter.json b/templates/adding_a_missing_tokenization_test/cookiecutter.json new file mode 100644 index 0000000000..2e53818f9b --- /dev/null +++ b/templates/adding_a_missing_tokenization_test/cookiecutter.json @@ -0,0 +1,10 @@ +{ + "modelname": "BrandNewBERT", + "uppercase_modelname": "BRAND_NEW_BERT", + "lowercase_modelname": "brand_new_bert", + "camelcase_modelname": "BrandNewBert", + "has_slow_class": ["True", "False"], + "has_fast_class": ["True", "False"], + "slow_tokenizer_use_sentencepiece": ["True", "False"], + "authors": "The HuggingFace Team" +}