From 588e6caa1534389cc3f9117d219376c02249dbea Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Mon, 23 Aug 2021 09:41:35 +0200 Subject: [PATCH] Overwrite get_clean_sequence as this was causing a bottleneck (#13183) --- tests/test_tokenization_luke.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_tokenization_luke.py b/tests/test_tokenization_luke.py index 84bf52a0f3..148e7de27b 100644 --- a/tests/test_tokenization_luke.py +++ b/tests/test_tokenization_luke.py @@ -15,6 +15,7 @@ import unittest +from typing import Tuple from transformers import AddedToken, LukeTokenizer from transformers.testing_utils import require_torch, slow @@ -81,6 +82,11 @@ class Luke(TokenizerTesterMixin, unittest.TestCase): assert encoded_sentence == encoded_text_from_decode assert encoded_pair == encoded_pair_from_decode + def get_clean_sequence(self, tokenizer, max_length=20) -> Tuple[str, list]: + txt = "Beyonce lives in Los Angeles" + ids = tokenizer.encode(txt, add_special_tokens=False) + return txt, ids + def test_space_encoding(self): tokenizer = self.get_tokenizer()