[breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency (#8073)
* Fixing roberta for slow-fast tests * WIP getting equivalence on pipelines * slow-to-fast equivalence - working on question-answering pipeline * optional FAISS tests * Pipeline Q&A * Move pipeline tests to their own test job again * update tokenizer to add sequence id methods * update to tokenizers 0.9.4 * set sentencepiece as optional * clean up squad * clean up pipelines to use sequence_ids * style/quality * wording * Switch to use_fast = True by default * update tests for use_fast at True by default * fix rag tokenizer test * removing protobuf from required dependencies * fix NER test for use_fast = True by default * fixing example tests (Q&A examples use slow tokenizers for now) * protobuf in main deps extras["sentencepiece"] and example deps * fix protobuf install test * try to fix seq2seq by switching to slow tokenizers for now * Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
@@ -4,13 +4,12 @@ import shutil
|
||||
import tempfile
|
||||
from unittest import TestCase
|
||||
|
||||
from transformers import BartTokenizer, BartTokenizerFast, DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast
|
||||
from transformers.configuration_bart import BartConfig
|
||||
from transformers.configuration_dpr import DPRConfig
|
||||
from transformers.file_utils import is_datasets_available, is_faiss_available, is_torch_available
|
||||
from transformers.testing_utils import require_datasets, require_faiss, require_torch, slow
|
||||
from transformers.tokenization_bart import BartTokenizer
|
||||
from transformers.testing_utils import require_datasets, require_faiss, require_tokenizers, require_torch, slow
|
||||
from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
|
||||
from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer
|
||||
from transformers.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
|
||||
|
||||
|
||||
@@ -96,6 +95,7 @@ class RagTokenizerTest(TestCase):
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmpdirname)
|
||||
|
||||
@require_tokenizers
|
||||
def test_save_load_pretrained_with_saved_config(self):
|
||||
|
||||
save_dir = os.path.join(self.tmpdirname, "rag_tokenizer")
|
||||
@@ -104,10 +104,10 @@ class RagTokenizerTest(TestCase):
|
||||
rag_config.save_pretrained(save_dir)
|
||||
rag_tokenizer.save_pretrained(save_dir)
|
||||
new_rag_tokenizer = RagTokenizer.from_pretrained(save_dir, config=rag_config)
|
||||
self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizer)
|
||||
self.assertEqual(new_rag_tokenizer.question_encoder.vocab, rag_tokenizer.question_encoder.vocab)
|
||||
self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizer)
|
||||
self.assertEqual(new_rag_tokenizer.generator.encoder, rag_tokenizer.generator.encoder)
|
||||
self.assertIsInstance(new_rag_tokenizer.question_encoder, DPRQuestionEncoderTokenizerFast)
|
||||
self.assertEqual(new_rag_tokenizer.question_encoder.get_vocab(), rag_tokenizer.question_encoder.get_vocab())
|
||||
self.assertIsInstance(new_rag_tokenizer.generator, BartTokenizerFast)
|
||||
self.assertEqual(new_rag_tokenizer.generator.get_vocab(), rag_tokenizer.generator.get_vocab())
|
||||
|
||||
@slow
|
||||
def test_pretrained_token_nq_tokenizer(self):
|
||||
|
||||
Reference in New Issue
Block a user