[breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency (#8073)

* Fixing roberta for slow-fast tests * WIP getting equivalence on pipelines * slow-to-fast equivalence - working on question-answering pipeline * optional FAISS tests * Pipeline Q&A * Move pipeline tests to their own test job again * update tokenizer to add sequence id methods * update to tokenizers 0.9.4 * set sentencepiecce as optional * clean up squad * clean up pipelines to use sequence_ids * style/quality * wording * Switch to use_fast = True by default * update tests for use_fast at True by default * fix rag tokenizer test * removing protobuf from required dependencies * fix NER test for use_fast = True by default * fixing example tests (Q&A examples use slow tokenizers for now) * protobuf in main deps extras["sentencepiece"] and example deps * fix protobug install test * try to fix seq2seq by switching to slow tokenizers for now * Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/tokenization_utils_base.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-15 22:50:59 +01:00
parent 24184e73c4
commit f4e04cd2c6
23 changed files with 689 additions and 262 deletions
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -8,7 +8,7 @@ from tqdm import tqdm

 from ...file_utils import is_tf_available, is_torch_available
 from ...tokenization_bert import whitespace_tokenize
-from ...tokenization_utils_base import PreTrainedTokenizerBase, TruncationStrategy
+from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
 from ...utils import logging
 from .utils import DataProcessor

@@ -765,6 +765,7 @@ class SquadFeatures:
        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
        start_position: start of the answer token index
        end_position: end of the answer token index
+        encoding: optionally store the BatchEncoding with the fast-tokenizer alignement methods.
    """

    def __init__(
@@ -784,6 +785,7 @@ class SquadFeatures:
        end_position,
        is_impossible,
        qas_id: str = None,
+        encoding: BatchEncoding = None,
    ):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
@@ -803,6 +805,8 @@ class SquadFeatures:
        self.is_impossible = is_impossible
        self.qas_id = qas_id

+        self.encoding = encoding
+

 class SquadResult:
    """