Fix BatchEncoding.word_to_tokens for removed tokens (#7939)

This commit is contained in:
Anthony MOI
2020-10-23 10:29:37 -04:00
committed by GitHub
parent 4acfd1a8dc
commit 5e323017a4
2 changed files with 16 additions and 5 deletions

View File

@@ -18,7 +18,7 @@ from typing import Callable, Optional
import numpy as np
from transformers import BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, TensorType
from transformers import BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, TensorType, TokenSpan
from transformers.testing_utils import require_tf, require_tokenizers, require_torch, slow
from transformers.tokenization_gpt2 import GPT2Tokenizer
@@ -142,6 +142,15 @@ class TokenizerUtilsTest(unittest.TestCase):
with self.subTest("Rust Tokenizer"):
self.assertTrue(tokenizer_r("Small example to_encode").is_fast)
@require_tokenizers
def test_batch_encoding_word_to_tokens(self):
tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased")
encoded = tokenizer_r(["Test", "\xad", "test"], is_split_into_words=True)
self.assertEqual(encoded.word_to_tokens(0), TokenSpan(start=1, end=2))
self.assertEqual(encoded.word_to_tokens(1), None)
self.assertEqual(encoded.word_to_tokens(2), TokenSpan(start=2, end=3))
def test_batch_encoding_with_labels(self):
batch = BatchEncoding({"inputs": [[1, 2, 3], [4, 5, 6]], "labels": [0, 1]})
tensor_batch = batch.convert_to_tensors(tensor_type="np")