From c94c1b89674f2b15b23c8c4ce30f036bf883717f Mon Sep 17 00:00:00 2001 From: SaulLu <55560583+SaulLu@users.noreply.github.com> Date: Wed, 22 Dec 2021 10:51:55 +0100 Subject: [PATCH] update the arguments `add_prefix_space` and `trim_offsets` in `backend_tokenizer.post_processor` of `RobertaTokenizerFast` (#14752) * add tests * change post-processor, pre-tokenizer and decoder (can't update decoder) * update test (remove decoder which doesn't depend on trim and add_prefix) * just update the post_processor * fix change * `trim_offsets` has no influence on `pre_tokenizer` * remove a test that needs some input from the `tokenizers` lib maintainers * format * add new test offsets roberta * polish comments --- .../roberta/tokenization_roberta_fast.py | 33 +++++- tests/test_tokenization_roberta.py | 105 ++++++++++++++++++ 2 files changed, 137 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py index 140c9f6d1d..28162a2994 100644 --- a/src/transformers/models/roberta/tokenization_roberta_fast.py +++ b/src/transformers/models/roberta/tokenization_roberta_fast.py @@ -13,9 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Fast Tokenization classes for RoBERTa.""" - +import json from typing import List, Optional +from tokenizers import processors + from ...tokenization_utils_base import AddedToken from ...utils import logging from ..gpt2.tokenization_gpt2_fast import GPT2TokenizerFast @@ -162,6 +164,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast): pad_token="<pad>", mask_token="<mask>", add_prefix_space=False, + trim_offsets=True, **kwargs ): super().__init__( @@ -177,9 +180,37 @@ class RobertaTokenizerFast(GPT2TokenizerFast): pad_token=pad_token, mask_token=mask_token, add_prefix_space=add_prefix_space, + trim_offsets=trim_offsets, **kwargs, ) + # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__` + tokenizer_component = "post_processor" + tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None) + if tokenizer_component_instance: + state = json.loads(tokenizer_component_instance.__getstate__()) + + # The lists 'sep' and 'cls' must be cast to tuples for the object `post_processor_class` + if "sep" in state: + state["sep"] = tuple(state["sep"]) + if "cls" in state: + state["cls"] = tuple(state["cls"]) + + changes_to_apply = False + + if state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + state["add_prefix_space"] = add_prefix_space + changes_to_apply = True + + if state.get("trim_offsets", trim_offsets) != trim_offsets: + state["trim_offsets"] = trim_offsets + changes_to_apply = True + + if changes_to_apply: + component_class = getattr(processors, state.pop("type")) + new_value = component_class(**state) + setattr(self.backend_tokenizer, tokenizer_component, new_value) + @property def mask_token(self) -> str: """ diff --git a/tests/test_tokenization_roberta.py b/tests/test_tokenization_roberta.py index 746c88d0f1..8b37a20e5a 100644 --- a/tests/test_tokenization_roberta.py +++ b/tests/test_tokenization_roberta.py @@ -14,6 +14,7 @@ # limitations under the License. 
+import itertools import json import os import unittest @@ -196,3 +197,107 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertSequenceEqual( tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"] ) + + def test_change_add_prefix_space_and_trim_offsets_args(self): + for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2): + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets + ) + + pre_tokenizer_state = json.loads(tokenizer_r.backend_tokenizer.pre_tokenizer.__getstate__()) + post_processor_state = json.loads(tokenizer_r.backend_tokenizer.post_processor.__getstate__()) + + self.assertEqual(pre_tokenizer_state["add_prefix_space"], add_prefix_space) + + self.assertEqual(post_processor_state["add_prefix_space"], add_prefix_space) + self.assertEqual(post_processor_state["trim_offsets"], trim_offsets) + + def test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments(self): + # Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space` and + # `trim_offsets` + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` + text = f"{text_of_1_token} {text_of_1_token}" + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), + ) + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, 
use_fast=True, add_prefix_space=False, trim_offsets=True + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), + ) + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)), + ) + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)), + ) + + text = f" {text}" + + # tokenizer_r = self.rust_tokenizer_class.from_pretrained( + # pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True + # ) + # encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + # self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token))) + # self.assertEqual( + # encoding.offset_mapping[1], + # (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), + # ) + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (1, 1 + 
len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), + ) + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (0, 1 + len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), + ) + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (0, 1 + len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), + )