[tests|tokenizers] Refactoring pipelines test backbone - Small tokenizers improvements - General tests speedups (#7970)

* WIP refactoring pipeline tests - switching to fast tokenizers * fix dialog pipeline and fill-mask * refactoring pipeline tests backbone * make large tests slow * fix tests (tf Bart inactive for now) * fix doc... * clean up for merge * fixing tests - remove bart from summarization until there is TF * fix quality and RAG * Add new translation pipeline tests - fix JAX tests * only slow for dialog * Fixing the missing TF-BART imports in modeling_tf_auto * spin out pipeline tests in separate CI job * adding pipeline test to CI YAML * add slow pipeline tests * speed up tf and pt join test to avoid redoing all the standalone pt and tf tests * Update src/transformers/tokenization_utils_base.py Co-authored-by: Sam Shleifer <sshleifer@gmail.com> * Update src/transformers/pipelines.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * add require_torch and require_tf in is_pt_tf_cross_test Co-authored-by: Sam Shleifer <sshleifer@gmail.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-23 15:58:19 +02:00
parent 88b3a91e61
commit 3a40cdf58d
32 changed files with 1587 additions and 1143 deletions
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -7,11 +7,8 @@ import numpy as np
 from tqdm import tqdm

 from ...file_utils import is_tf_available, is_torch_available
-from ...tokenization_bart import BartTokenizer
 from ...tokenization_bert import whitespace_tokenize
-from ...tokenization_longformer import LongformerTokenizer
-from ...tokenization_roberta import RobertaTokenizer
-from ...tokenization_utils_base import TruncationStrategy
+from ...tokenization_utils_base import PreTrainedTokenizerBase, TruncationStrategy
 from ...utils import logging
 from .utils import DataProcessor

@@ -112,7 +109,14 @@ def squad_convert_example_to_features(
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
-        if isinstance(tokenizer, (RobertaTokenizer, LongformerTokenizer, BartTokenizer)):
+        if tokenizer.__class__.__name__ in [
+            "RobertaTokenizer",
+            "LongformerTokenizer",
+            "BartTokenizer",
+            "RobertaTokenizerFast",
+            "LongformerTokenizerFast",
+            "BartTokenizerFast",
+        ]:
            sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
        else:
            sub_tokens = tokenizer.tokenize(token)
@@ -292,7 +296,7 @@ def squad_convert_example_to_features(
    return features


-def squad_convert_example_to_features_init(tokenizer_for_convert):
+def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase):
    global tokenizer
    tokenizer = tokenizer_for_convert

@@ -344,9 +348,9 @@ def squad_convert_examples_to_features(
            is_training=not evaluate,
        )
    """
-
    # Defining helper methods
    features = []
+
    threads = min(threads, cpu_count())
    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
        annotate_ = partial(
@@ -365,6 +369,7 @@ def squad_convert_examples_to_features(
                disable=not tqdm_enabled,
            )
        )
+
    new_features = []
    unique_id = 1000000000
    example_index = 0