From be236361f1ab5e27bda5c224160321fbd8b67f24 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 29 Oct 2021 11:34:18 +0200 Subject: [PATCH] Adding `batch_size` support for (almost) all pipelines (#13724) * Tentative enabling of `batch_size` for pipelines. * Add systematic test for pipeline batching. * Enabling batch_size on almost all pipelines - Not `zero-shot` (it's already passing stuff as batched so trickier) - Not `QA` (preprocess uses squad features, we need to switch to real tensors at this boundary). * Adding `min_length_for_response` for conversational. * Making CTC, speech mappings available regardless of framework. * Attempt at fixing automatic tests (ffmpeg not enabled for fast tests) * Removing ffmpeg dependency in tests. * Small fixes. * Slight cleanup. * Adding docs and addressing comments. * Quality. * Update docs/source/main_classes/pipelines.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/question_answering.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/zero_shot_classification.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Improving docs. * Update docs/source/main_classes/pipelines.rst Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> * N -> observed_batch_size softmax trick. * Follow `padding_side`. * Supporting image pipeline batching (and padding). * Rename `unbatch` -> `loader_batch`. * unbatch_size forgot. * Custom padding for offset mappings. * Attempt to remove librosa. * Adding require_audio. * torchaudio. * Back to using datasets librosa. * Adding help to set a pad_token on the tokenizer. 
* Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Quality. Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> --- docs/source/main_classes/pipelines.rst | 143 +++++++++++++++ src/transformers/__init__.py | 4 + .../pipelines/automatic_speech_recognition.py | 4 + src/transformers/pipelines/base.py | 172 +++++++++++++++++- src/transformers/pipelines/conversational.py | 8 +- src/transformers/pipelines/fill_mask.py | 4 +- .../pipelines/object_detection.py | 5 +- .../pipelines/question_answering.py | 6 + .../pipelines/token_classification.py | 13 +- .../pipelines/zero_shot_classification.py | 3 + src/transformers/utils/dummy_pt_objects.py | 6 + tests/test_pipelines_audio_classification.py | 23 ++- ..._pipelines_automatic_speech_recognition.py | 57 +++++- tests/test_pipelines_common.py | 118 +++++++++++- tests/test_pipelines_conversational.py | 5 +- tests/test_pipelines_feature_extraction.py | 17 +- tests/test_pipelines_fill_mask.py | 18 +- tests/test_pipelines_image_classification.py | 12 +- tests/test_pipelines_object_detection.py | 7 +- tests/test_pipelines_question_answering.py | 11 +- tests/test_pipelines_summarization.py | 6 +- tests/test_pipelines_text2text_generation.py | 4 +- tests/test_pipelines_text_classification.py | 5 +- tests/test_pipelines_text_generation.py | 8 +- tests/test_pipelines_token_classification.py | 7 +- tests/test_pipelines_translation.py | 19 +- tests/test_pipelines_zero_shot.py | 8 +- 27 files changed, 629 insertions(+), 64 deletions(-) diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst index 
1bc60d340f..146c504861 100644 --- a/docs/source/main_classes/pipelines.rst +++ b/docs/source/main_classes/pipelines.rst @@ -71,6 +71,11 @@ GPU. If it doesn't don't hesitate to create an issue. .. code-block:: + import datasets + from transformers import pipeline + from transformers.pipelines.base import KeyDataset + import tqdm + pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0) dataset = datasets.load_dataset("superb", name="asr", split="test") @@ -85,6 +90,144 @@ GPU. If it doesn't don't hesitate to create an issue. .. autofunction:: transformers.pipeline +Pipeline batching +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +All pipelines (except `zero-shot-classification` and `question-answering` currently) can use batching. This will work +whenever the pipeline uses its streaming ability (so when passing lists or :obj:`Dataset`). + +.. code-block:: + + from transformers import pipeline + from transformers.pipelines.base import KeyDataset + import datasets + import tqdm + + dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised") + pipe = pipeline("text-classification", device=0) + for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"): + print(out) + # [{'label': 'POSITIVE', 'score': 0.9998743534088135}] + # Exactly the same output as before, but the content are passed + # as batches to the model + + +.. warning:: + + However, this is not automatically a win for performance. It can be either a 10x speedup or 5x slowdown depending + on hardware, data and the actual model being used. + + Example where it's most a speedup: + + +.. 
code-block:: + + from transformers import pipeline + from torch.utils.data import Dataset + import tqdm + + + pipe = pipeline("text-classification", device=0) + + + class MyDataset(Dataset): + def __len__(self): + return 5000 + + def __getitem__(self, i): + return "This is a test" + + + dataset = MyDataset() + + for batch_size in [1, 8, 64, 256]: + print("-" * 30) + print(f"Streaming batch_size={batch_size}") + for out in tqdm.tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)): + pass + + +.. code-block:: + + # On GTX 970 + ------------------------------ + Streaming no batching + 100%|██████████████████████████████████████████████████████████████████████| 5000/5000 [00:26<00:00, 187.52it/s] + ------------------------------ + Streaming batch_size=8 + 100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1205.95it/s] + ------------------------------ + Streaming batch_size=64 + 100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2478.24it/s] + ------------------------------ + Streaming batch_size=256 + 100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2554.43it/s] + (diminishing returns, saturated the GPU) + + +Example where it's most a slowdown: + +.. code-block:: + + class MyDataset(Dataset): + def __len__(self): + return 5000 + + def __getitem__(self, i): + if i % 64 == 0: + n = 100 + else: + n = 1 + return "This is a test" * n + +This is a occasional very long sentence compared to the other. In that case, the **whole** batch will need to be 400 +tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to the high slowdown. Even worse, on +bigger batches, the program simply crashes. + + +.. 
code-block:: + + ------------------------------ + Streaming no batching + 100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s] + ------------------------------ + Streaming batch_size=8 + 100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 265.74it/s] + ------------------------------ + Streaming batch_size=64 + 100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.80it/s] + ------------------------------ + Streaming batch_size=256 + 0%| | 0/1000 [00:00 + for out in tqdm.tqdm(pipe(dataset, batch_size=256), total=len(dataset)): + .... + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) + RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch) + + +There are no good (general) solutions for this problem, and your mileage may vary depending on your use cases. Rule of +thumb: + +For users, a rule of thumb is: + +- **Measure performance on your load, with your hardware. Measure, measure, and keep measuring. Real numbers are the + only way to go.** +- If you are latency constrained (live product doing inference), don't batch +- If you are using CPU, don't batch. +- If you are using throughput (you want to run your model on a bunch of static data), on GPU, then: + + - If you have no clue about the size of the sequence_length ("natural" data), by default don't batch, measure and + try tentatively to add it, add OOM checks to recover when it will fail (and it will at some point if you don't + control the sequence_length.) + - If your sequence_length is super regular, then batching is more likely to be VERY interesting, measure and push + it until you get OOMs. 
+ - The larger the GPU the more likely batching is going to be more interesting +- As soon as you enable batching, make sure you can handle OOMs nicely. + + + Implementing a pipeline ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ef83e85a9f..ffc4e4bda7 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -584,6 +584,7 @@ if is_torch_available(): [ "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING", "MODEL_FOR_CAUSAL_LM_MAPPING", + "MODEL_FOR_CTC_MAPPING", "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", "MODEL_FOR_MASKED_LM_MAPPING", @@ -594,6 +595,7 @@ if is_torch_available(): "MODEL_FOR_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", "MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", + "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "MODEL_MAPPING", @@ -2430,6 +2432,7 @@ if TYPE_CHECKING: from .models.auto import ( MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING, MODEL_FOR_CAUSAL_LM_MAPPING, + MODEL_FOR_CTC_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_MASKED_LM_MAPPING, @@ -2440,6 +2443,7 @@ if TYPE_CHECKING: MODEL_FOR_QUESTION_ANSWERING_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, MODEL_MAPPING, diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index c5c249916b..f1dd39c46d 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -169,6 +169,10 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): 
elif model_class in MODEL_FOR_CTC_MAPPING.values(): outputs = self.model(**model_inputs) tokens = outputs.logits.squeeze(0).argmax(dim=-1) + else: + logger.warning("This is an unknown class, treating it as CTC.") + outputs = self.model(**model_inputs) + tokens = outputs.logits.squeeze(0).argmax(dim=-1) return tokens def postprocess(self, model_outputs): diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 6497e71f56..cf4414103b 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -25,6 +25,7 @@ from contextlib import contextmanager from os.path import abspath, exists from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +import numpy as np from packaging import version from ..feature_extraction_utils import PreTrainedFeatureExtractor @@ -59,12 +60,80 @@ if TYPE_CHECKING: logger = logging.get_logger(__name__) -def collate_fn(items): +def no_collate_fn(items): if len(items) != 1: raise ValueError("This collate_fn is meant to be used with batch_size=1") return items[0] +def _pad(items, key, padding_value, padding_side): + batch_size = len(items) + if isinstance(items[0][key], torch.Tensor): + # Others include `attention_mask` etc... 
+ shape = items[0][key].shape + dim = len(shape) + if dim == 4: + # This is probable image so padding shouldn't be necessary + # B, C, H, W + return torch.cat([item[key] for item in items], dim=0) + max_length = max(item[key].shape[1] for item in items) + dtype = items[0][key].dtype + + if dim == 2: + tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value + elif dim == 3: + tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value + + for i, item in enumerate(items): + if dim == 2: + if padding_side == "left": + tensor[i, -len(item[key][0]) :] = item[key][0].clone() + else: + tensor[i, : len(item[key][0])] = item[key][0].clone() + elif dim == 3: + if padding_side == "left": + tensor[i, -len(item[key][0]) :, :] = item[key][0].clone() + else: + tensor[i, : len(item[key][0]), :] = item[key][0].clone() + return tensor + else: + return [item[key] for item in items] + + +def pad_collate_fn(tokenizer, feature_extractor): + padding_side = "right" + if tokenizer is None and feature_extractor is None: + raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching") + if tokenizer is not None: + if tokenizer.pad_token_id is None: + raise ValueError( + "Pipeline with tokenizer without pad_token cannot do batching. You can try to set it with " + "`pipe.tokenizer.pad_token_id = model.config.eos_token_id`." + ) + else: + padding_value = tokenizer.pad_token_id + padding_side = tokenizer.padding_side + if feature_extractor is not None: + # Feature extractor can be images, where no padding is expected + padding_value = getattr(feature_extractor, "padding_value", None) + padding_side = getattr(feature_extractor, "padding_side", None) + + def inner(items): + keys = set(items[0].keys()) + for item in items: + if set(item.keys()) != keys: + raise ValueError( + f"The elements of the batch contain different keys. Cannot batch them ({set(item.keys())} != {keys})" + ) + # input_values, input_pixels, input_ids, ... 
+ padded = { + key: _pad(items, key, padding_value if key.startswith("input_") else 0, padding_side) for key in keys + } + return padded + + return inner + + def infer_framework_load_model( model, config: AutoConfig, @@ -591,6 +660,13 @@ PIPELINE_INIT_ARGS = r""" is provided. task (:obj:`str`, defaults to :obj:`""`): A task-identifier for the pipeline. + num_workers (:obj:`int`, `optional`, defaults to 8): + When the pipeline will use `DataLoader` (when passing a dataset, on GPU for a Pytorch model), the number of + workers to be used. + batch_size (:obj:`int`, `optional`, defaults to 1): + When the pipeline will use `DataLoader` (when passing a dataset, on GPU for a Pytorch model), the size of + the batch to use, for inference this is not always beneficial, please read `Batching with pipelines + `_ . args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`): Reference to the object in charge of parsing supplied pipeline parameters. device (:obj:`int`, `optional`, defaults to -1): @@ -617,10 +693,44 @@ if is_torch_available(): return processed class PipelineIterator(IterableDataset): - def __init__(self, loader, infer, params): + def __init__(self, loader, infer, params, loader_batch_size=None): + """ + Roughly equivalent to + + .. code-block:: + for item in loader: + yield infer(item, **params) + + Arguments: + loader (:obj:`torch.utils.data.DataLoader` or any iterator): + The iterator that will be used to apply :obj:`infer` on. + infer (any function): + The function to apply of each element of :obj:`loader`. + params (:obj:`dict`): + The parameters passed to :obj:`infer` along with every item + loader_batch_size (:obj:`int`, `optional`): + If specified, the items of :obj:`loader` are supposed to come as batch, and are loader_batched here + making it roughly behave as + + + .. 
code-block:: + + for items in loader: + for i in loader_batch_size: + item = items[i] + yield infer(item, **params) + """ self.loader = loader self.infer = infer self.params = params + if loader_batch_size == 1: + # Let's spare some time by deactivating altogether + loader_batch_size = None + self.loader_batch_size = loader_batch_size + + # Internal bookkeeping + self._loader_batch_index = None + self._loader_batch_data = None def __len__(self): return len(self.loader) @@ -629,10 +739,49 @@ if is_torch_available(): self.iterator = iter(self.loader) return self + def loader_batch_item(self): + if isinstance(self._loader_batch_data, torch.Tensor): + result = self._loader_batch_data[self._loader_batch_index] + else: + loader_batched = {} + for k, element in self._loader_batch_data.items(): + if k == "past_key_values": + continue + if isinstance(element[self._loader_batch_index], torch.Tensor): + loader_batched[k] = element[self._loader_batch_index].unsqueeze(0) + elif isinstance(element[self._loader_batch_index], np.ndarray): + loader_batched[k] = np.expand_dims(element[self._loader_batch_index], 0) + else: + loader_batched[k] = element[self._loader_batch_index] + result = self._loader_batch_data.__class__(loader_batched) + self._loader_batch_index += 1 + return result + def __next__(self): + if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size: + return self.loader_batch_item() + item = next(self.iterator) processed = self.infer(item, **self.params) - return processed + if self.loader_batch_size is not None: + if isinstance(processed, torch.Tensor): + first_tensor = processed + else: + key = list(processed.keys())[0] + first_tensor = processed[key] + if isinstance(first_tensor, list): + observed_batch_size = len(first_tensor) + else: + observed_batch_size = first_tensor.shape[0] + if 0 < observed_batch_size < self.loader_batch_size: + # Could be last batch so we can't unroll as many + # elements. 
+ self.loader_batch_size = observed_batch_size + self._loader_batch_data = processed + self._loader_batch_index = 0 + return self.loader_batch_item() + else: + return processed class KeyDataset(Dataset): def __init__(self, dataset: Dataset, key: str): @@ -881,17 +1030,20 @@ class Pipeline(_ScikitCompat): raise ValueError(f"Framework {self.framework} is not supported") return model_outputs - def get_iterator(self, inputs, num_workers: int, preprocess_params, forward_params, postprocess_params): + def get_iterator( + self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params + ): if "TOKENIZERS_PARALLELISM" not in os.environ: logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already") os.environ["TOKENIZERS_PARALLELISM"] = "false" dataset = PipelineDataset(inputs, self.preprocess, preprocess_params) - dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=1, collate_fn=collate_fn) - model_iterator = PipelineIterator(dataloader, self.forward, forward_params) + collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, self.feature_extractor) + dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn) + model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size) final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params) return final_iterator - def __call__(self, inputs, *args, num_workers=8, **kwargs): + def __call__(self, inputs, *args, num_workers=0, batch_size=1, **kwargs): if args: logger.warning(f"Ignoring args : {args}") preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(**kwargs) @@ -910,14 +1062,16 @@ class Pipeline(_ScikitCompat): if isinstance(inputs, list): if self.framework == "pt": final_iterator = self.get_iterator( - inputs, num_workers, preprocess_params, forward_params, postprocess_params + 
inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params ) outputs = [output for output in final_iterator] return outputs else: return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params) elif Dataset is not None and isinstance(inputs, Dataset): - return self.get_iterator(inputs, num_workers, preprocess_params, forward_params, postprocess_params) + return self.get_iterator( + inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params + ) else: return self.run_single(inputs, preprocess_params, forward_params, postprocess_params) diff --git a/src/transformers/pipelines/conversational.py b/src/transformers/pipelines/conversational.py index 4cb0e888a4..2fd90061a5 100644 --- a/src/transformers/pipelines/conversational.py +++ b/src/transformers/pipelines/conversational.py @@ -243,7 +243,7 @@ class ConversationalPipeline(Pipeline): return outputs[0] return outputs - def preprocess(self, conversation: Conversation) -> Dict[str, Any]: + def preprocess(self, conversation: Conversation, min_length_for_response=32) -> Dict[str, Any]: if not isinstance(conversation, Conversation): raise ValueError("ConversationalPipeline, expects Conversation as inputs") if conversation.new_user_input is None: @@ -274,18 +274,18 @@ class ConversationalPipeline(Pipeline): if "attention_mask" in model_inputs: model_inputs["attention_mask"] = model_inputs["attention_mask"][:, -trim:] conversation = model_inputs.pop("conversation") - model_inputs["max_length"] = max_length + generate_kwargs["max_length"] = max_length output_ids = self.model.generate(**model_inputs, **generate_kwargs) if self.model.config.is_encoder_decoder: start_position = 1 else: start_position = n - return {"output_ids": output_ids[0, start_position:], "conversation": conversation} + return {"output_ids": output_ids[:, start_position:], "conversation": conversation} def postprocess(self, model_outputs, clean_up_tokenization_spaces=True): 
output_ids = model_outputs["output_ids"] answer = self.tokenizer.decode( - output_ids, + output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=clean_up_tokenization_spaces, ) diff --git a/src/transformers/pipelines/fill_mask.py b/src/transformers/pipelines/fill_mask.py index 5392db979b..c1a03f6220 100644 --- a/src/transformers/pipelines/fill_mask.py +++ b/src/transformers/pipelines/fill_mask.py @@ -89,14 +89,14 @@ class FillMaskPipeline(Pipeline): def _forward(self, model_inputs): model_outputs = self.model(**model_inputs) - model_outputs["input_ids"] = model_inputs["input_ids"][0] + model_outputs["input_ids"] = model_inputs["input_ids"] return model_outputs def postprocess(self, model_outputs, top_k=5, target_ids=None): # Cap top_k if there are targets if target_ids is not None and target_ids.shape[0] < top_k: top_k = target_ids.shape[0] - input_ids = model_outputs["input_ids"] + input_ids = model_outputs["input_ids"][0] outputs = model_outputs["logits"] result = [] diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py index a3496b304f..6ecdc41f38 100644 --- a/src/transformers/pipelines/object_detection.py +++ b/src/transformers/pipelines/object_detection.py @@ -114,11 +114,12 @@ class ObjectDetectionPipeline(Pipeline): def _forward(self, model_inputs): target_size = model_inputs.pop("target_size") outputs = self.model(**model_inputs) - model_outputs = {"outputs": outputs, "target_size": target_size} + model_outputs = outputs.__class__({"target_size": target_size, **outputs}) return model_outputs def postprocess(self, model_outputs, threshold=0.9): - raw_annotations = self.feature_extractor.post_process(model_outputs["outputs"], model_outputs["target_size"]) + target_size = model_outputs["target_size"] + raw_annotations = self.feature_extractor.post_process(model_outputs, target_size) raw_annotation = raw_annotations[0] keep = raw_annotation["scores"] > threshold scores = 
raw_annotation["scores"][keep] diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py index 585fd78dd0..ec864e0a20 100644 --- a/src/transformers/pipelines/question_answering.py +++ b/src/transformers/pipelines/question_answering.py @@ -8,9 +8,12 @@ from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_featur from ..file_utils import PaddingStrategy, add_end_docstrings, is_tf_available, is_torch_available from ..modelcard import ModelCard from ..tokenization_utils import PreTrainedTokenizer +from ..utils import logging from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline +logger = logging.get_logger(__name__) + if TYPE_CHECKING: from ..modeling_tf_utils import TFPreTrainedModel from ..modeling_utils import PreTrainedModel @@ -241,6 +244,9 @@ class QuestionAnsweringPipeline(Pipeline): - **end** (:obj:`int`) -- The character end index of the answer (in the tokenized version of the input). - **answer** (:obj:`str`) -- The answer to the question. 
""" + if kwargs.get("batch_size", 1) > 1: + logger.error("Batch_size > 1 is not supported for question answering pipeline, setting it to 1.") + kwargs["batch_size"] = 1 # Convert inputs to features examples = self._args_parser(*args, **kwargs) diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py index fc3fce2366..49ede61dc8 100644 --- a/src/transformers/pipelines/token_classification.py +++ b/src/transformers/pipelines/token_classification.py @@ -204,12 +204,12 @@ class TokenClassificationPipeline(Pipeline): offset_mapping = model_inputs.pop("offset_mapping", None) sentence = model_inputs.pop("sentence") if self.framework == "tf": - outputs = self.model(model_inputs.data)[0][0] + logits = self.model(model_inputs.data)[0] else: - outputs = self.model(**model_inputs)[0][0] + logits = self.model(**model_inputs)[0] return { - "outputs": outputs, + "logits": logits, "special_tokens_mask": special_tokens_mask, "offset_mapping": offset_mapping, "sentence": sentence, @@ -217,13 +217,16 @@ class TokenClassificationPipeline(Pipeline): } def postprocess(self, model_outputs, aggregation_strategy=AggregationStrategy.NONE): - outputs = model_outputs["outputs"].numpy() + logits = model_outputs["logits"][0].numpy() sentence = model_outputs["sentence"] input_ids = model_outputs["input_ids"][0] offset_mapping = model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy() - scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True) + maxes = np.max(logits, axis=-1, keepdims=True) + shifted_exp = np.exp(logits - maxes) + scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + pre_entities = self.gather_pre_entities( sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy ) diff --git a/src/transformers/pipelines/zero_shot_classification.py 
b/src/transformers/pipelines/zero_shot_classification.py index f308c48b16..7d3aa1b43c 100644 --- a/src/transformers/pipelines/zero_shot_classification.py +++ b/src/transformers/pipelines/zero_shot_classification.py @@ -183,6 +183,9 @@ class ZeroShotClassificationPipeline(Pipeline): - **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood. - **scores** (:obj:`List[float]`) -- The probabilities for each of the labels. """ + if kwargs.get("batch_size", 1) > 1: + logger.error("Batch size > 1 is not supported for zero-shot pipeline, setting batch_size=1.") + kwargs["batch_size"] = 1 if len(args) == 0: pass diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 3ac8fcbd0e..3a4354b953 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -313,6 +313,9 @@ MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = None MODEL_FOR_CAUSAL_LM_MAPPING = None +MODEL_FOR_CTC_MAPPING = None + + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None @@ -343,6 +346,9 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = None +MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = None + + MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None diff --git a/tests/test_pipelines_audio_classification.py b/tests/test_pipelines_audio_classification.py index 561d333caa..07d6a3629d 100644 --- a/tests/test_pipelines_audio_classification.py +++ b/tests/test_pipelines_audio_classification.py @@ -24,6 +24,7 @@ from transformers.testing_utils import ( require_datasets, require_tf, require_torch, + require_torchaudio, slow, ) @@ -35,15 +36,16 @@ from .test_pipelines_common import ANY, PipelineTestCaseMeta class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING - @require_datasets - @slow - def run_pipeline_test(self, model, tokenizer, feature_extractor): - import datasets - + def 
get_test_pipeline(self, model, tokenizer, feature_extractor): audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=feature_extractor) # test with a raw waveform audio = np.zeros((34000,)) + audio2 = np.zeros((14000,)) + return audio_classifier, [audio2, audio] + + def run_pipeline_test(self, audio_classifier, examples): + audio2, audio = examples output = audio_classifier(audio) # by default a model is initialized with num_labels=2 self.assertEqual( @@ -61,10 +63,17 @@ class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest ], ) + self.run_torchaudio(audio_classifier) + + @require_datasets + @require_torchaudio + def run_torchaudio(self, audio_classifier): + import datasets + # test with a local file dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - filename = dataset[0]["file"] - output = audio_classifier(filename) + audio = dataset[0]["audio"]["array"] + output = audio_classifier(audio) self.assertEqual( output, [ diff --git a/tests/test_pipelines_automatic_speech_recognition.py b/tests/test_pipelines_automatic_speech_recognition.py index e2f7644859..a50512887d 100644 --- a/tests/test_pipelines_automatic_speech_recognition.py +++ b/tests/test_pipelines_automatic_speech_recognition.py @@ -14,11 +14,28 @@ import unittest +import numpy as np import pytest -from transformers import AutoFeatureExtractor, AutoTokenizer, Speech2TextForConditionalGeneration, Wav2Vec2ForCTC +from transformers import ( + MODEL_FOR_CTC_MAPPING, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, + AutoFeatureExtractor, + AutoTokenizer, + Speech2TextForConditionalGeneration, + Wav2Vec2ForCTC, +) from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline -from transformers.testing_utils import is_pipeline_test, require_datasets, require_torch, require_torchaudio, slow +from transformers.testing_utils import ( + is_pipeline_test, + require_datasets, + require_tf, + require_torch, 
+ require_torchaudio, + slow, +) + +from .test_pipelines_common import ANY, PipelineTestCaseMeta # We can't use this mixin because it assumes TF support. @@ -26,14 +43,42 @@ from transformers.testing_utils import is_pipeline_test, require_datasets, requi @is_pipeline_test -class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): +class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): + model_mapping = { + k: v + for k, v in (list(MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.items()) if MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING else []) + + (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else []) + } + + def get_test_pipeline(self, model, tokenizer, feature_extractor): + if tokenizer is None: + # Side effect of no Fast Tokenizer class for these model, so skipping + # But the slow tokenizer test should still run as they're quite small + self.skipTest("No tokenizer available") + return + # return None, None + + speech_recognizer = AutomaticSpeechRecognitionPipeline( + model=model, tokenizer=tokenizer, feature_extractor=feature_extractor + ) + + # test with a raw waveform + audio = np.zeros((34000,)) + audio2 = np.zeros((14000,)) + return speech_recognizer, [audio, audio2] + + def run_pipeline_test(self, speech_recognizer, examples): + audio = np.zeros((34000,)) + outputs = speech_recognizer(audio) + self.assertEqual(outputs, {"text": ANY(str)}) + @require_torch @slow def test_pt_defaults(self): pipeline("automatic-speech-recognition", framework="pt") @require_torch - def test_torch_small(self): + def test_small_model_pt(self): import numpy as np speech_recognizer = pipeline( @@ -46,6 +91,10 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): output = speech_recognizer(waveform) self.assertEqual(output, {"text": "(Applaudissements)"}) + @require_tf + def test_small_model_tf(self): + self.skipTest("Tensorflow not supported yet.") + @require_torch def test_torch_small_no_tokenizer_files(self): # test that model 
without tokenizer file cannot be loaded diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index d727fdbb7c..e64d4b8c09 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import importlib import logging +import random import string import unittest from abc import abstractmethod @@ -21,6 +23,7 @@ from functools import lru_cache from unittest import skipIf from transformers import FEATURE_EXTRACTOR_MAPPING, TOKENIZER_MAPPING, AutoFeatureExtractor, AutoTokenizer, pipeline +from transformers.pipelines.base import _pad from transformers.testing_utils import is_pipeline_test, require_torch @@ -73,6 +76,12 @@ def get_tiny_config_from_class(configuration_class): @lru_cache(maxsize=100) def get_tiny_tokenizer_from_checkpoint(checkpoint): tokenizer = AutoTokenizer.from_pretrained(checkpoint) + if tokenizer.vocab_size < 300: + # Wav2Vec2ForCTC for instance + # ByT5Tokenizer + # all are already small enough and have no Fast version that can + # be retrained + return tokenizer logger.info("Training new from iterator ...") vocabulary = string.ascii_letters + string.digits + " " tokenizer = tokenizer.train_new_from_iterator(vocabulary, vocab_size=len(vocabulary), show_progress=False) @@ -87,6 +96,12 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config): feature_extractor = None if hasattr(tiny_config, "image_size") and feature_extractor: feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size) + + # Speech2TextModel specific. 
+ if hasattr(tiny_config, "input_feat_per_channel") and feature_extractor: + feature_extractor = feature_extractor.__class__( + feature_size=tiny_config.input_feat_per_channel, num_mel_bins=tiny_config.input_feat_per_channel + ) return feature_extractor @@ -136,7 +151,26 @@ class PipelineTestCaseMeta(type): else: tokenizer = None feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config) - self.run_pipeline_test(model, tokenizer, feature_extractor) + pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor) + if pipeline is None: + # The test can disable itself, but it should be very marginal + # Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer doesn't exist) + return + self.run_pipeline_test(pipeline, examples) + + def run_batch_test(pipeline, examples): + # Need to copy because `Conversation` objects are stateful + if pipeline.tokenizer is not None and pipeline.tokenizer.pad_token_id is None: + return # No batching for this and it's OK + + # 10 examples with batch size 4 means there needs to be an unfinished batch + # which is important for the unbatcher + dataset = [copy.deepcopy(random.choice(examples)) for i in range(10)] + + for item in pipeline(dataset, batch_size=4): + pass + + run_batch_test(pipeline, examples) return test @@ -211,3 +245,85 @@ class CommonPipelineTest(unittest.TestCase): dataset = MyDataset() for output in text_classifier(dataset): self.assertEqual(output, {"label": ANY(str), "score": ANY(float)}) + + +@is_pipeline_test +class PipelinePadTest(unittest.TestCase): + @require_torch + def test_pipeline_padding(self): + import torch + + items = [ + { + "label": "label1", + "input_ids": torch.LongTensor([[1, 23, 24, 2]]), + "attention_mask": torch.LongTensor([[0, 1, 1, 0]]), + }, + { + "label": "label2", + "input_ids": torch.LongTensor([[1, 23, 24, 43, 44, 2]]), + "attention_mask": torch.LongTensor([[0, 1, 1, 1, 1, 0]]), + }, + ] + + self.assertEqual(_pad(items, "label", 0, "right"),
["label1", "label2"]) + self.assertTrue( + torch.allclose( + _pad(items, "input_ids", 10, "right"), + torch.LongTensor([[1, 23, 24, 2, 10, 10], [1, 23, 24, 43, 44, 2]]), + ) + ) + self.assertTrue( + torch.allclose( + _pad(items, "input_ids", 10, "left"), + torch.LongTensor([[10, 10, 1, 23, 24, 2], [1, 23, 24, 43, 44, 2]]), + ) + ) + self.assertTrue( + torch.allclose( + _pad(items, "attention_mask", 0, "right"), torch.LongTensor([[0, 1, 1, 0, 0, 0], [0, 1, 1, 1, 1, 0]]) + ) + ) + + @require_torch + def test_pipeline_image_padding(self): + import torch + + items = [ + { + "label": "label1", + "pixel_values": torch.zeros((1, 3, 10, 10)), + }, + { + "label": "label2", + "pixel_values": torch.zeros((1, 3, 10, 10)), + }, + ] + + self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"]) + self.assertTrue( + torch.allclose( + _pad(items, "pixel_values", 10, "right"), + torch.zeros((2, 3, 10, 10)), + ) + ) + + @require_torch + def test_pipeline_offset_mapping(self): + import torch + + items = [ + { + "offset_mappings": torch.zeros([1, 11, 2], dtype=torch.long), + }, + { + "offset_mappings": torch.zeros([1, 4, 2], dtype=torch.long), + }, + ] + + self.assertTrue( + torch.allclose( + _pad(items, "offset_mappings", 0, "right"), + torch.zeros((2, 11, 2), dtype=torch.long), + ), + ) diff --git a/tests/test_pipelines_conversational.py b/tests/test_pipelines_conversational.py index 5d7e1fd3a4..342a09e2e6 100644 --- a/tests/test_pipelines_conversational.py +++ b/tests/test_pipelines_conversational.py @@ -54,8 +54,11 @@ class ConversationalPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM else [] ) - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) + return conversation_agent, [Conversation("Hi there!")] + + def run_pipeline_test(self, conversation_agent, _): # Simple outputs = 
conversation_agent(Conversation("Hi there!")) self.assertEqual(outputs, Conversation(past_user_inputs=["Hi there!"], generated_responses=[ANY(str)])) diff --git a/tests/test_pipelines_feature_extraction.py b/tests/test_pipelines_feature_extraction.py index 4a1ed8f969..87b2b10ba5 100644 --- a/tests/test_pipelines_feature_extraction.py +++ b/tests/test_pipelines_feature_extraction.py @@ -14,7 +14,15 @@ import unittest -from transformers import MODEL_MAPPING, TF_MODEL_MAPPING, CLIPConfig, FeatureExtractionPipeline, LxmertConfig, pipeline +from transformers import ( + MODEL_MAPPING, + TF_MODEL_MAPPING, + CLIPConfig, + FeatureExtractionPipeline, + LxmertConfig, + Wav2Vec2Config, + pipeline, +) from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch from .test_pipelines_common import PipelineTestCaseMeta @@ -61,12 +69,12 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa raise ValueError("We expect lists of floats, nothing else") return shape - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): if tokenizer is None: self.skipTest("No tokenizer") return - elif isinstance(model.config, (LxmertConfig, CLIPConfig)): + elif isinstance(model.config, (LxmertConfig, CLIPConfig, Wav2Vec2Config)): self.skipTest( "This is an Lxmert bimodal model, we need to find a more consistent way to switch on those models." 
) @@ -81,11 +89,12 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa ) return - feature_extractor = FeatureExtractionPipeline( model=model, tokenizer=tokenizer, feature_extractor=feature_extractor ) + return feature_extractor, ["This is a test", "This is another test"] + def run_pipeline_test(self, feature_extractor, examples): outputs = feature_extractor("This is a test") shape = self.get_shape(outputs) diff --git a/tests/test_pipelines_fill_mask.py b/tests/test_pipelines_fill_mask.py index fb48fe52cd..43801ef0c1 100644 --- a/tests/test_pipelines_fill_mask.py +++ b/tests/test_pipelines_fill_mask.py @@ -159,22 +159,32 @@ class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="pt") unmasker.tokenizer.pad_token_id = None unmasker.tokenizer.pad_token = None - self.run_pipeline_test(unmasker.model, unmasker.tokenizer, None) + self.run_pipeline_test(unmasker, []) @require_tf def test_model_no_pad_tf(self): unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="tf") unmasker.tokenizer.pad_token_id = None unmasker.tokenizer.pad_token = None - self.run_pipeline_test(unmasker.model, unmasker.tokenizer, None) + self.run_pipeline_test(unmasker, []) - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): if tokenizer is None or tokenizer.mask_token_id is None: self.skipTest("The provided tokenizer has no mask token, (probably reformer or wav2vec2)") fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer) + examples = [ + f"This is another {tokenizer.mask_token} test", + ] + return fill_masker, examples - outputs = fill_masker(f"This is a {tokenizer.mask_token}") + def run_pipeline_test(self, fill_masker, examples): + tokenizer = fill_masker.tokenizer + model = fill_masker.model + + outputs = fill_masker( + 
f"This is a {tokenizer.mask_token}", + ) self.assertEqual( outputs, [ diff --git a/tests/test_pipelines_image_classification.py b/tests/test_pipelines_image_classification.py index e06dec9d0b..81cdde3f3f 100644 --- a/tests/test_pipelines_image_classification.py +++ b/tests/test_pipelines_image_classification.py @@ -44,9 +44,17 @@ else: class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - @require_datasets - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): + image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor) + examples = [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + "http://images.cocodataset.org/val2017/000000039769.jpg", + ] + return image_classifier, examples + + @require_datasets + def run_pipeline_test(self, image_classifier, examples): outputs = image_classifier("./tests/fixtures/tests_samples/COCO/000000039769.png") self.assertEqual( diff --git a/tests/test_pipelines_object_detection.py b/tests/test_pipelines_object_detection.py index b20bebbf25..19f3447e97 100644 --- a/tests/test_pipelines_object_detection.py +++ b/tests/test_pipelines_object_detection.py @@ -53,9 +53,12 @@ else: class ObjectDetectionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING - @require_datasets - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor) + return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"] + + @require_datasets + def run_pipeline_test(self, object_detector, examples): outputs = object_detector("./tests/fixtures/tests_samples/COCO/000000039769.png", 
threshold=0.0) self.assertGreater(len(outputs), 0) diff --git a/tests/test_pipelines_question_answering.py b/tests/test_pipelines_question_answering.py index cd0e7acde1..ca85c89ac9 100644 --- a/tests/test_pipelines_question_answering.py +++ b/tests/test_pipelines_question_answering.py @@ -32,13 +32,20 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING tf_model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): if isinstance(model.config, LxmertConfig): # This is an bimodal model, we need to find a more consistent way # to switch on those models. - return + return None, None question_answerer = QuestionAnsweringPipeline(model, tokenizer) + examples = [ + {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, + {"question": "In what field is HuggingFace ?", "context": "HuggingFace is an AI startup."}, + ] + return question_answerer, examples + + def run_pipeline_test(self, question_answerer, _): outputs = question_answerer( question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." 
) diff --git a/tests/test_pipelines_summarization.py b/tests/test_pipelines_summarization.py index f3f77410c7..e434ed742d 100644 --- a/tests/test_pipelines_summarization.py +++ b/tests/test_pipelines_summarization.py @@ -36,8 +36,12 @@ class SummarizationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMe model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer) + return summarizer, ["(CNN)The Palestinian Authority officially became", "Some other text"] + + def run_pipeline_test(self, summarizer, _): + model = summarizer.model outputs = summarizer("(CNN)The Palestinian Authority officially became") self.assertEqual(outputs, [{"summary_text": ANY(str)}]) diff --git a/tests/test_pipelines_text2text_generation.py b/tests/test_pipelines_text2text_generation.py index 2ca6f93e9f..7de2b263b2 100644 --- a/tests/test_pipelines_text2text_generation.py +++ b/tests/test_pipelines_text2text_generation.py @@ -30,9 +30,11 @@ class Text2TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTest model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer) + return generator, ["Something to write", "Something else"] + def run_pipeline_test(self, generator, _): outputs = generator("Something there") self.assertEqual(outputs, [{"generated_text": ANY(str)}]) # These are encoder decoder, they don't just append to incoming string diff --git a/tests/test_pipelines_text_classification.py b/tests/test_pipelines_text_classification.py index 
3daad11597..7bc794b4d3 100644 --- a/tests/test_pipelines_text_classification.py +++ b/tests/test_pipelines_text_classification.py @@ -72,9 +72,12 @@ class TextClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestC outputs = text_classifier("Birds are a type of animal") self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}]) - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer) + return text_classifier, ["HuggingFace is in", "This is another test"] + def run_pipeline_test(self, text_classifier, _): + model = text_classifier.model # Small inputs because BartTokenizer tiny has maximum position embeddings = 22 valid_inputs = "HuggingFace is in" outputs = text_classifier(valid_inputs) diff --git a/tests/test_pipelines_text_generation.py b/tests/test_pipelines_text_generation.py index 3618a2be73..ebe71a5591 100644 --- a/tests/test_pipelines_text_generation.py +++ b/tests/test_pipelines_text_generation.py @@ -88,8 +88,14 @@ class TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM ], ) - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer) + return text_generator, ["This is a test", "Another test"] + + def run_pipeline_test(self, text_generator, _): + model = text_generator.model + tokenizer = text_generator.tokenizer + outputs = text_generator("This is a test") self.assertEqual(outputs, [{"generated_text": ANY(str)}]) self.assertTrue(outputs[0]["generated_text"].startswith("This is a test")) diff --git a/tests/test_pipelines_token_classification.py b/tests/test_pipelines_token_classification.py index d94e4cc7f8..caeef47d95 100644 --- a/tests/test_pipelines_token_classification.py +++ 
b/tests/test_pipelines_token_classification.py @@ -45,8 +45,13 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING - def run_pipeline_test(self, model, tokenizer, feature_extractor): + def get_test_pipeline(self, model, tokenizer, feature_extractor): token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer) + return token_classifier, ["A simple string", "A simple string that is quite a bit longer"] + + def run_pipeline_test(self, token_classifier, _): + model = token_classifier.model + tokenizer = token_classifier.tokenizer outputs = token_classifier("A simple string") self.assertIsInstance(outputs, list) diff --git a/tests/test_pipelines_translation.py b/tests/test_pipelines_translation.py index 50b13331f3..7185fe9c18 100644 --- a/tests/test_pipelines_translation.py +++ b/tests/test_pipelines_translation.py @@ -20,6 +20,7 @@ from transformers import ( MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, MBart50TokenizerFast, + MBartConfig, MBartForConditionalGeneration, TranslationPipeline, pipeline, @@ -34,14 +35,16 @@ class TranslationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def run_pipeline_test(self, model, tokenizer, feature_extractor): - translator = TranslationPipeline(model=model, tokenizer=tokenizer) - try: - outputs = translator("Some string") - except ValueError: - # Triggered by m2m langages - src_lang, tgt_lang = list(translator.tokenizer.lang_code_to_id.keys())[:2] - outputs = translator("Some string", src_lang=src_lang, tgt_lang=tgt_lang) + def get_test_pipeline(self, model, tokenizer, feature_extractor): + if isinstance(model.config, MBartConfig): + src_lang, tgt_lang = list(tokenizer.lang_code_to_id.keys())[:2] 
+ translator = TranslationPipeline(model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang) + else: + translator = TranslationPipeline(model=model, tokenizer=tokenizer) + return translator, ["Some string", "Some other text"] + + def run_pipeline_test(self, translator, _): + outputs = translator("Some string") self.assertEqual(outputs, [{"translation_text": ANY(str)}]) @require_torch diff --git a/tests/test_pipelines_zero_shot.py b/tests/test_pipelines_zero_shot.py index ae47eb626c..ed564581e5 100644 --- a/tests/test_pipelines_zero_shot.py +++ b/tests/test_pipelines_zero_shot.py @@ -31,9 +31,13 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase, metaclass=PipelineT model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING - def run_pipeline_test(self, model, tokenizer, feature_extractor): - classifier = ZeroShotClassificationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, feature_extractor): + classifier = ZeroShotClassificationPipeline( + model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"] + ) + return classifier, ["Who are you voting for in 2020?", "My stomach hurts."] + def run_pipeline_test(self, classifier, _): outputs = classifier("Who are you voting for in 2020?", candidate_labels="politics") self.assertEqual(outputs, {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]})