Override get_clean_sequence, as the existing implementation was causing a bottleneck (#13183)
This commit is contained in:
@@ -15,6 +15,7 @@
|
|||||||
|
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
from transformers import AddedToken, LukeTokenizer
|
from transformers import AddedToken, LukeTokenizer
|
||||||
from transformers.testing_utils import require_torch, slow
|
from transformers.testing_utils import require_torch, slow
|
||||||
@@ -81,6 +82,11 @@ class Luke(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
assert encoded_sentence == encoded_text_from_decode
|
assert encoded_sentence == encoded_text_from_decode
|
||||||
assert encoded_pair == encoded_pair_from_decode
|
assert encoded_pair == encoded_pair_from_decode
|
||||||
|
|
||||||
|
def get_clean_sequence(self, tokenizer, max_length=20) -> Tuple[str, list]:
    """Return a fixed sample sentence together with its token ids.

    The sentence is hard-coded, so the only work done is a single
    ``tokenizer.encode`` call.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``.
        max_length: Accepted so the signature stays call-compatible, but
            not used; the returned sequence is never truncated to it.

    Returns:
        A ``(text, ids)`` tuple where ``ids`` is whatever
        ``tokenizer.encode`` returns for ``text`` with
        ``add_special_tokens=False``.
    """
    txt = "Beyonce lives in Los Angeles"
    # Special tokens are left out so the ids correspond to the raw text only.
    ids = tokenizer.encode(txt, add_special_tokens=False)
    return txt, ids
|
||||||
|
|
||||||
def test_space_encoding(self):
|
def test_space_encoding(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user