[breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency (#8073)

* Fixing roberta for slow-fast tests

* WIP getting equivalence on pipelines

* slow-to-fast equivalence - working on question-answering pipeline

* optional FAISS tests

* Pipeline Q&A

* Move pipeline tests to their own test job again

* update tokenizer to add sequence id methods

* update to tokenizers 0.9.4

* set sentencepiece as optional

* clean up squad

* clean up pipelines to use sequence_ids

* style/quality

* wording

* Switch to use_fast = True by default

* update tests for use_fast at True by default

* fix rag tokenizer test

* removing protobuf from required dependencies

* fix NER test for use_fast = True by default

* fixing example tests (Q&A examples use slow tokenizers for now)

* protobuf in main deps extras["sentencepiece"] and example deps

* fix protobuf install test

* try to fix seq2seq by switching to slow tokenizers for now

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
Thomas Wolf
2020-11-15 22:50:59 +01:00
committed by GitHub
parent 24184e73c4
commit f4e04cd2c6
23 changed files with 689 additions and 262 deletions

View File

@@ -1,10 +1,10 @@
from typing import List, Optional
from unittest import mock
from transformers import is_tf_available, is_torch_available, pipeline
# from transformers.pipelines import DefaultArgumentHandler, Pipeline
from transformers.pipelines import Pipeline
from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow
from transformers.tokenization_utils_base import to_py_obj
VALID_INPUTS = ["A simple string", ["list of strings"]]
@@ -13,9 +13,11 @@ VALID_INPUTS = ["A simple string", ["list of strings"]]
@is_pipeline_test
class CustomInputPipelineCommonMixin:
pipeline_task = None
pipeline_loading_kwargs = {}
small_models = None # Models tested without the @slow decorator
large_models = None # Models tested with the @slow decorator
pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with
pipeline_running_kwargs = {} # Additional kwargs to run the pipeline with
small_models = [] # Models tested without the @slow decorator
large_models = [] # Models tested with the @slow decorator
valid_inputs = VALID_INPUTS # Some inputs which are valid to compare fast and slow tokenizers
def setUp(self) -> None:
if not is_tf_available() and not is_torch_available():
@@ -47,73 +49,11 @@ class CustomInputPipelineCommonMixin:
@require_torch
@slow
def test_pt_defaults(self):
pipeline(self.pipeline_task, framework="pt")
@require_tf
@slow
def test_tf_defaults(self):
pipeline(self.pipeline_task, framework="tf")
@require_torch
def test_torch_small(self):
for model_name in self.small_models:
nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="pt")
self._test_pipeline(nlp)
@require_tf
def test_tf_small(self):
for model_name in self.small_models:
nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="tf")
self._test_pipeline(nlp)
@require_torch
@slow
def test_torch_large(self):
for model_name in self.large_models:
nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="pt")
self._test_pipeline(nlp)
@require_tf
@slow
def test_tf_large(self):
for model_name in self.large_models:
nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="tf")
self._test_pipeline(nlp)
def _test_pipeline(self, nlp: Pipeline):
raise NotImplementedError
@is_pipeline_test
class MonoInputPipelineCommonMixin:
pipeline_task = None
pipeline_loading_kwargs = {} # Additional kwargs to load the pipeline with
pipeline_running_kwargs = {} # Additional kwargs to run the pipeline with
small_models = [] # Models tested without the @slow decorator
large_models = [] # Models tested with the @slow decorator
mandatory_keys = {} # Keys which should be in the output
valid_inputs = VALID_INPUTS # inputs which are valid
invalid_inputs = [None] # inputs which are not allowed
expected_multi_result: Optional[List] = None
expected_check_keys: Optional[List[str]] = None
def setUp(self) -> None:
if not is_tf_available() and not is_torch_available():
return # Currently no JAX pipelines
for model_name in self.small_models:
pipeline(self.pipeline_task, model=model_name, tokenizer=model_name, **self.pipeline_loading_kwargs)
for model_name in self.large_models:
pipeline(self.pipeline_task, model=model_name, tokenizer=model_name, **self.pipeline_loading_kwargs)
@require_torch
@slow
def test_pt_defaults_loads(self):
pipeline(self.pipeline_task, framework="pt", **self.pipeline_loading_kwargs)
@require_tf
@slow
def test_tf_defaults_loads(self):
def test_tf_defaults(self):
pipeline(self.pipeline_task, framework="tf", **self.pipeline_loading_kwargs)
@require_torch
@@ -166,6 +106,95 @@ class MonoInputPipelineCommonMixin:
)
self._test_pipeline(nlp)
def _test_pipeline(self, nlp: Pipeline):
raise NotImplementedError
@require_torch
def test_compare_slow_fast_torch(self):
    """Build each small model's PyTorch pipeline with a slow and a fast tokenizer and compare them.

    The comparison itself is delegated to `_compare_slow_fast_pipelines`, which
    is told to watch the model's `forward` method.
    """
    for checkpoint in self.small_models:
        # Slow tokenizer first, then fast: same construction order as before.
        nlp_slow, nlp_fast = [
            pipeline(
                task=self.pipeline_task,
                model=checkpoint,
                tokenizer=checkpoint,
                framework="pt",
                use_fast=fast_flag,
                **self.pipeline_loading_kwargs,
            )
            for fast_flag in (False, True)
        ]
        self._compare_slow_fast_pipelines(nlp_slow, nlp_fast, method="forward")
@require_tf
def test_compare_slow_fast_tf(self):
    """Build each small model's TensorFlow pipeline with a slow and a fast tokenizer and compare them.

    The comparison itself is delegated to `_compare_slow_fast_pipelines`, which
    is told to watch the model's `call` method.
    """
    for checkpoint in self.small_models:
        # Slow tokenizer first, then fast: same construction order as before.
        nlp_slow, nlp_fast = [
            pipeline(
                task=self.pipeline_task,
                model=checkpoint,
                tokenizer=checkpoint,
                framework="tf",
                use_fast=fast_flag,
                **self.pipeline_loading_kwargs,
            )
            for fast_flag in (False, True)
        ]
        self._compare_slow_fast_pipelines(nlp_slow, nlp_fast, method="call")
def _compare_slow_fast_pipelines(self, nlp_slow: Pipeline, nlp_fast: Pipeline, method: str):
    """Check that the model receives identical inputs with slow and fast tokenizers.

    The `method` attribute of each pipeline's model is patched with a mock that
    wraps the real method, so every call is recorded while the real method still
    runs. Both pipelines are run on every entry of `self.valid_inputs`, then the
    recorded calls are compared pairwise.

    Args:
        nlp_slow: Pipeline built with `use_fast=False`.
        nlp_fast: Pipeline built with `use_fast=True`.
        method: Name of the model method to spy on (the callers in this class
            pass "forward" for PyTorch and "call" for TensorFlow).
    """
    with mock.patch.object(
        nlp_slow.model, method, wraps=getattr(nlp_slow.model, method)
    ) as mock_slow, mock.patch.object(nlp_fast.model, method, wraps=getattr(nlp_fast.model, method)) as mock_fast:
        for inputs in self.valid_inputs:
            if isinstance(inputs, dict):
                # Merge into a fresh dict instead of `inputs.update(...)`: `valid_inputs`
                # is a class attribute, so mutating its entries would leak the running
                # kwargs into every other test that reads it.
                call_kwargs = {**inputs, **self.pipeline_running_kwargs}
                _ = nlp_slow(**call_kwargs)
                _ = nlp_fast(**call_kwargs)
            else:
                _ = nlp_slow(inputs, **self.pipeline_running_kwargs)
                _ = nlp_fast(inputs, **self.pipeline_running_kwargs)

        mock_slow.assert_called()
        mock_fast.assert_called()

        self.assertEqual(len(mock_slow.call_args_list), len(mock_fast.call_args_list))
        # Pair each slow call with the matching *fast* call. Zipping the slow list
        # with itself (as was done previously) makes the comparison vacuous.
        for mock_slow_call_args, mock_fast_call_args in zip(mock_slow.call_args_list, mock_fast.call_args_list):
            slow_call_args, slow_call_kwargs = mock_slow_call_args
            fast_call_args, fast_call_kwargs = mock_fast_call_args

            # Convert the recorded arguments to plain Python objects before comparing.
            slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs)
            fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs)

            self.assertEqual(slow_call_args, fast_call_args)
            self.assertDictEqual(slow_call_kwargs, fast_call_kwargs)
@is_pipeline_test
class MonoInputPipelineCommonMixin(CustomInputPipelineCommonMixin):
"""A version of the CustomInputPipelineCommonMixin
with a predefined `_test_pipeline` method.
"""
mandatory_keys = {} # Keys which should be in the output
invalid_inputs = [None] # inputs which are not allowed
expected_multi_result: Optional[List] = None
expected_check_keys: Optional[List[str]] = None
def _test_pipeline(self, nlp: Pipeline):
self.assertIsNotNone(nlp)
@@ -199,76 +228,3 @@ class MonoInputPipelineCommonMixin:
self.assertIn(key, result)
self.assertRaises(Exception, nlp, self.invalid_inputs)
# @is_pipeline_test
# class DefaultArgumentHandlerTestCase(unittest.TestCase):
# def setUp(self) -> None:
# self.handler = DefaultArgumentHandler()
#
# def test_kwargs_x(self):
# mono_data = {"X": "This is a sample input"}
# mono_args = self.handler(**mono_data)
#
# self.assertTrue(isinstance(mono_args, list))
# self.assertEqual(len(mono_args), 1)
#
# multi_data = {"x": ["This is a sample input", "This is a second sample input"]}
# multi_args = self.handler(**multi_data)
#
# self.assertTrue(isinstance(multi_args, list))
# self.assertEqual(len(multi_args), 2)
#
# def test_kwargs_data(self):
# mono_data = {"data": "This is a sample input"}
# mono_args = self.handler(**mono_data)
#
# self.assertTrue(isinstance(mono_args, list))
# self.assertEqual(len(mono_args), 1)
#
# multi_data = {"data": ["This is a sample input", "This is a second sample input"]}
# multi_args = self.handler(**multi_data)
#
# self.assertTrue(isinstance(multi_args, list))
# self.assertEqual(len(multi_args), 2)
#
# def test_multi_kwargs(self):
# mono_data = {"data": "This is a sample input", "X": "This is a sample input 2"}
# mono_args = self.handler(**mono_data)
#
# self.assertTrue(isinstance(mono_args, list))
# self.assertEqual(len(mono_args), 2)
#
# multi_data = {
# "data": ["This is a sample input", "This is a second sample input"],
# "test": ["This is a sample input 2", "This is a second sample input 2"],
# }
# multi_args = self.handler(**multi_data)
#
# self.assertTrue(isinstance(multi_args, list))
# self.assertEqual(len(multi_args), 4)
#
# def test_args(self):
# mono_data = "This is a sample input"
# mono_args = self.handler(mono_data)
#
# self.assertTrue(isinstance(mono_args, list))
# self.assertEqual(len(mono_args), 1)
#
# mono_data = ["This is a sample input"]
# mono_args = self.handler(mono_data)
#
# self.assertTrue(isinstance(mono_args, list))
# self.assertEqual(len(mono_args), 1)
#
# multi_data = ["This is a sample input", "This is a second sample input"]
# multi_args = self.handler(multi_data)
#
# self.assertTrue(isinstance(multi_args, list))
# self.assertEqual(len(multi_args), 2)
#
# multi_data = ["This is a sample input", "This is a second sample input"]
# multi_args = self.handler(*multi_data)
#
# self.assertTrue(isinstance(multi_args, list))
# self.assertEqual(len(multi_args), 2)

View File

@@ -1,29 +0,0 @@
import unittest
from transformers.pipelines import Conversation, Pipeline
from .test_pipelines_common import CustomInputPipelineCommonMixin
class DialoguePipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
pipeline_task = "conversational"
small_models = [] # Default model - Models tested without the @slow decorator
large_models = ["microsoft/DialoGPT-medium"] # Models tested with the @slow decorator
def _test_pipeline(self, nlp: Pipeline):
valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]]
invalid_inputs = ["Hi there!", Conversation()]
self.assertIsNotNone(nlp)
mono_result = nlp(valid_inputs[0])
self.assertIsInstance(mono_result, Conversation)
multi_result = nlp(valid_inputs[1])
self.assertIsInstance(multi_result, list)
self.assertIsInstance(multi_result[0], Conversation)
# Inactive conversations passed to the pipeline raise a ValueError
self.assertRaises(ValueError, nlp, valid_inputs[1])
for bad_input in invalid_inputs:
self.assertRaises(Exception, nlp, bad_input)
self.assertRaises(Exception, nlp, invalid_inputs)

View File

@@ -146,10 +146,10 @@ class NerPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
@require_torch
def test_pt_ignore_subwords_slow_tokenizer_raises(self):
for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
with self.assertRaises(ValueError):
pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True)
pipeline(task="ner", model=model_name, tokenizer=tokenizer, ignore_subwords=True, use_fast=False)
@require_torch
def test_pt_defaults_slow_tokenizer(self):

View File

@@ -8,10 +8,22 @@ from .test_pipelines_common import CustomInputPipelineCommonMixin
class QAPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
pipeline_task = "question-answering"
pipeline_running_kwargs = {
"padding": "max_length",
"max_seq_len": 25,
"doc_stride": 5,
} # Default is 'longest' but we use 'max_length' to test equivalence between slow/fast tokenizers
small_models = [
"sshleifer/tiny-distilbert-base-cased-distilled-squad"
] # Models tested without the @slow decorator
large_models = [] # Models tested with the @slow decorator
valid_inputs = [
{"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
{
"question": "In what field is HuggingFace working ?",
"context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.",
},
]
def _test_pipeline(self, nlp: Pipeline):
output_keys = {"score", "answer", "start", "end"}

View File

@@ -12,6 +12,18 @@ class ZeroShotClassificationPipelineTests(CustomInputPipelineCommonMixin, unitte
"sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english"
] # Models tested without the @slow decorator
large_models = ["roberta-large-mnli"] # Models tested with the @slow decorator
valid_inputs = [
{"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics"},
{"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics"]},
{"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics, public health"},
{"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics", "public health"]},
{"sequences": ["Who are you voting for in 2020?"], "candidate_labels": "politics"},
{
"sequences": "Who are you voting for in 2020?",
"candidate_labels": "politics",
"hypothesis_template": "This text is about {}",
},
]
def _test_scores_sum_to_one(self, result):
sum = 0.0

View File

@@ -9,7 +9,7 @@ from unittest.mock import patch
import numpy as np
from datasets import Dataset
import faiss
from transformers import is_faiss_available
from transformers.configuration_bart import BartConfig
from transformers.configuration_dpr import DPRConfig
from transformers.configuration_rag import RagConfig
@@ -27,6 +27,10 @@ from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer
from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
if is_faiss_available():
import faiss
@require_faiss
@require_datasets
class RagRetrieverTest(TestCase):

View File

@@ -116,5 +116,5 @@ class AutoTokenizerTest(unittest.TestCase):
@require_tokenizers
def test_from_pretrained_use_fast_toggle(self):
self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizer)
self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True), BertTokenizerFast)
self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False), BertTokenizer)
self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizerFast)

View File

@@ -576,6 +576,42 @@ class TokenizerTesterMixin:
sequences, mask = information["input_ids"], information["token_type_ids"]
self.assertEqual(len(sequences), len(mask))
def test_token_type_ids(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
seq_0 = "Test this method."
# We want sequence 0 and sequence 1 to be tagged
# respectively with 0 and 1 token type ids
# (regardless of whether the model uses token type ids).
# We rely on this assumption in the QA pipeline, among other places.
output = tokenizer(seq_0, return_token_type_ids=True)
self.assertIn(0, output["token_type_ids"])
def test_sequence_ids(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
if not tokenizer.is_fast:
continue
with self.subTest(f"{tokenizer.__class__.__name__}"):
seq_0 = "Test this method."
seq_1 = "With these inputs."
# We want sequence 0 and sequence 1 to be tagged
# respectively with 0 and 1 sequence ids
# (regardless of whether the model uses token type ids).
# We rely on this assumption in the QA pipeline, among other places.
output = tokenizer(seq_0)
self.assertIn(0, output.sequence_ids())
output = tokenizer(seq_0, seq_1)
self.assertIn(0, output.sequence_ids())
self.assertIn(1, output.sequence_ids())
if tokenizer.num_special_tokens_to_add(pair=True):
self.assertIn(None, output.sequence_ids())
def test_number_of_added_tokens(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
@@ -1878,6 +1914,144 @@ class TokenizerTesterMixin:
batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1
)
# Assert token_to_sequence
self.assertEqual(encoding.token_to_sequence(num_tokens // 2), 0)
self.assertEqual(encoding.token_to_sequence(0, num_tokens // 2), 0)
self.assertEqual(batch_encoding.token_to_sequence(1, num_tokens // 2), 0)
self.assertEqual(batch_encoding.token_to_sequence(0, num_tokens // 2), 0)
self.assertEqual(batch_encoding.token_to_sequence(last_batch_index, num_tokens // 2), 0)
# Pair of input sequences
words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
text = " ".join(words)
pair_words = ["Amazing", "example", "full", "of", "inspiration"]
pair_text = " ".join(pair_words)
batch_size = 3
index_word_in_first_seq = words.index("inspiration")
index_word_in_pair_seq = pair_words.index("inspiration")
index_char_in_first_seq = text.find("inspiration")
index_char_in_pair_seq = pair_text.find("inspiration")
pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=False)
pair_batch_encoding = tokenizer_r.batch_encode_plus(
[(text, pair_text)] * batch_size, add_special_tokens=False
)
num_tokens = len(encoding["input_ids"])
last_word_index = len(words) - 1
last_token_index = num_tokens - 1
last_batch_index = batch_size - 1
last_char_index = len(text) - 1
# Assert word_to_tokens
self.assertNotEqual(
pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start,
pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start,
)
self.assertEqual(
pair_encoding["input_ids"][
pair_encoding.word_to_tokens(index_word_in_first_seq, sequence_index=0).start
],
pair_encoding["input_ids"][
pair_encoding.word_to_tokens(index_word_in_pair_seq, sequence_index=1).start
],
)
self.assertNotEqual(
pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start,
pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start,
)
self.assertEqual(
pair_batch_encoding["input_ids"][1][
pair_batch_encoding.word_to_tokens(1, index_word_in_first_seq, sequence_index=0).start
],
pair_batch_encoding["input_ids"][1][
pair_batch_encoding.word_to_tokens(1, index_word_in_pair_seq, sequence_index=1).start
],
)
# Assert char_to_token
self.assertNotEqual(
pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0),
pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1),
)
self.assertEqual(
pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_first_seq, sequence_index=0)],
pair_encoding["input_ids"][pair_encoding.char_to_token(index_char_in_pair_seq, sequence_index=1)],
)
self.assertNotEqual(
pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0),
pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1),
)
self.assertEqual(
pair_batch_encoding["input_ids"][1][
pair_batch_encoding.char_to_token(1, index_char_in_first_seq, sequence_index=0)
],
pair_batch_encoding["input_ids"][1][
pair_batch_encoding.char_to_token(1, index_char_in_pair_seq, sequence_index=1)
],
)
# Assert char_to_word
self.assertNotEqual(
pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0),
pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1),
)
self.assertEqual(
words[pair_encoding.char_to_word(index_char_in_first_seq, sequence_index=0)],
pair_words[pair_encoding.char_to_word(index_char_in_pair_seq, sequence_index=1)],
)
self.assertNotEqual(
pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0),
pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1),
)
self.assertEqual(
words[pair_batch_encoding.char_to_word(1, index_char_in_first_seq, sequence_index=0)],
pair_words[pair_batch_encoding.char_to_word(1, index_char_in_pair_seq, sequence_index=1)],
)
# Assert word_to_chars
self.assertNotEqual(
pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start,
pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start,
)
self.assertEqual(
text[pair_encoding.word_to_chars(index_word_in_first_seq, sequence_index=0).start],
pair_text[pair_encoding.word_to_chars(index_word_in_pair_seq, sequence_index=1).start],
)
self.assertNotEqual(
pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start,
pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start,
)
self.assertEqual(
text[pair_batch_encoding.word_to_chars(1, index_word_in_first_seq, sequence_index=0).start],
pair_text[pair_batch_encoding.word_to_chars(1, index_word_in_pair_seq, sequence_index=1).start],
)
# Assert token_to_sequence
pair_encoding = tokenizer_r.encode_plus(text, pair_text, add_special_tokens=True)
pair_sequence_ids = [
pair_encoding.token_to_sequence(i) for i in range(len(pair_encoding["input_ids"]))
]
self.assertIn(0, pair_sequence_ids)
self.assertIn(1, pair_sequence_ids)
if tokenizer_r.num_special_tokens_to_add(pair=True):
self.assertIn(None, pair_sequence_ids)
pair_batch_encoding = tokenizer_r.batch_encode_plus(
[(text, pair_text)] * batch_size, add_special_tokens=True
)
pair_batch_sequence_ids = [
pair_batch_encoding.token_to_sequence(1, i)
for i in range(len(pair_batch_encoding["input_ids"][0]))
]
self.assertIn(0, pair_batch_sequence_ids)
self.assertIn(1, pair_batch_sequence_ids)
if tokenizer_r.num_special_tokens_to_add(pair=True):
self.assertIn(None, pair_batch_sequence_ids)
def test_tokenization_python_rust_equals(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):

View File

@@ -4,13 +4,12 @@ import shutil
import tempfile
from unittest import TestCase
from transformers import BartTokenizer, BartTokenizerFast, DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast
from transformers.configuration_bart import BartConfig
from transformers.configuration_dpr import DPRConfig
from transformers.file_utils import is_datasets_available, is_faiss_available, is_torch_available
from transformers.testing_utils import require_datasets, require_faiss, require_torch, slow
from transformers.tokenization_bart import BartTokenizer
from transformers.testing_utils import require_datasets, require_faiss, require_tokenizers, require_torch, slow
from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer
from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
@@ -96,6 +95,7 @@ class RagTokenizerTest(TestCase):
def tearDown(self):
shutil.rmtree(self.tmpdirname)
@require_tokenizers
def test_save_load_pretrained_with_saved_config(self):
save_dir = os.path.join(self.tmpdirname, "rag_tokenizer")
@@ -104,10 +104,10 @@ class RagTokenizerTest(TestCase):
rag_config.save_pretrained(save_dir)
rag_tokenizer.save_pretrained(save_dir)
new_rag_tokenizer = RagTokenizer.from_pretrained(save_dir, config=rag_config)
self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizer)
self.assertEqual(new_rag_tokenizer.question_encoder.vocab, rag_tokenizer.question_encoder.vocab)
self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizer)
self.assertEqual(new_rag_tokenizer.generator.encoder, rag_tokenizer.generator.encoder)
self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizerFast)
self.assertEqual(new_rag_tokenizer.question_encoder.get_vocab(), rag_tokenizer.question_encoder.get_vocab())
self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizerFast)
self.assertEqual(new_rag_tokenizer.generator.get_vocab(), rag_tokenizer.generator.get_vocab())
@slow
def test_pretrained_token_nq_tokenizer(self):

View File

@@ -18,7 +18,7 @@ import os
import unittest
from transformers.file_utils import cached_property
from transformers.testing_utils import slow
from transformers.testing_utils import require_sentencepiece, slow
from transformers.tokenization_xlm_prophetnet import SPIECE_UNDERLINE, XLMProphetNetTokenizer
from .test_tokenization_common import TokenizerTesterMixin
@@ -27,6 +27,7 @@ from .test_tokenization_common import TokenizerTesterMixin
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
@require_sentencepiece
class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMProphetNetTokenizer