[tests|tokenizers] Refactoring pipelines test backbone - Small tokenizers improvements - General tests speedups (#7970)
* WIP refactoring pipeline tests - switching to fast tokenizers
* fix dialog pipeline and fill-mask
* refactoring pipeline tests backbone
* make large tests slow
* fix tests (tf Bart inactive for now)
* fix doc...
* clean up for merge
* fixing tests - remove bart from summarization until there is TF
* fix quality and RAG
* Add new translation pipeline tests - fix JAX tests
* only slow for dialog
* Fixing the missing TF-BART imports in modeling_tf_auto
* spin out pipeline tests in separate CI job
* adding pipeline test to CI YAML
* add slow pipeline tests
* speed up tf and pt join test to avoid redoing all the standalone pt and tf tests
* Update src/transformers/tokenization_utils_base.py
  Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
* Update src/transformers/pipelines.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update src/transformers/pipelines.py
  Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
* Update src/transformers/testing_utils.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* add require_torch and require_tf in is_pt_tf_cross_test

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
@@ -7,11 +7,8 @@ import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
from ...file_utils import is_tf_available, is_torch_available
|
||||
from ...tokenization_bart import BartTokenizer
|
||||
from ...tokenization_bert import whitespace_tokenize
|
||||
from ...tokenization_longformer import LongformerTokenizer
|
||||
from ...tokenization_roberta import RobertaTokenizer
|
||||
from ...tokenization_utils_base import TruncationStrategy
|
||||
from ...tokenization_utils_base import PreTrainedTokenizerBase, TruncationStrategy
|
||||
from ...utils import logging
|
||||
from .utils import DataProcessor
|
||||
|
||||
@@ -112,7 +109,14 @@ def squad_convert_example_to_features(
|
||||
all_doc_tokens = []
|
||||
for (i, token) in enumerate(example.doc_tokens):
|
||||
orig_to_tok_index.append(len(all_doc_tokens))
|
||||
if isinstance(tokenizer, (RobertaTokenizer, LongformerTokenizer, BartTokenizer)):
|
||||
if tokenizer.__class__.__name__ in [
|
||||
"RobertaTokenizer",
|
||||
"LongformerTokenizer",
|
||||
"BartTokenizer",
|
||||
"RobertaTokenizerFast",
|
||||
"LongformerTokenizerFast",
|
||||
"BartTokenizerFast",
|
||||
]:
|
||||
sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
|
||||
else:
|
||||
sub_tokens = tokenizer.tokenize(token)
|
||||
@@ -292,7 +296,7 @@ def squad_convert_example_to_features(
|
||||
return features
|
||||
|
||||
|
||||
def squad_convert_example_to_features_init(tokenizer_for_convert):
|
||||
def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase):
|
||||
global tokenizer
|
||||
tokenizer = tokenizer_for_convert
|
||||
|
||||
@@ -344,9 +348,9 @@ def squad_convert_examples_to_features(
|
||||
is_training=not evaluate,
|
||||
)
|
||||
"""
|
||||
|
||||
# Defining helper methods
|
||||
features = []
|
||||
|
||||
threads = min(threads, cpu_count())
|
||||
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
|
||||
annotate_ = partial(
|
||||
@@ -365,6 +369,7 @@ def squad_convert_examples_to_features(
|
||||
disable=not tqdm_enabled,
|
||||
)
|
||||
)
|
||||
|
||||
new_features = []
|
||||
unique_id = 1000000000
|
||||
example_index = 0
|
||||
|
||||
Reference in New Issue
Block a user