Adding batch_size support for (almost) all pipelines (#13724)
* Tentative enabling of `batch_size` for pipelines. * Add systematic test for pipeline batching. * Enabling batch_size on almost all pipelines - Not `zero-shot` (it's already passing stuff as batched so trickier) - Not `QA` (preprocess uses squad features, we need to switch to real tensors at this boundary. * Adding `min_length_for_response` for conversational. * Making CTC, speech mappings avaiable regardless of framework. * Attempt at fixing automatic tests (ffmpeg not enabled for fast tests) * Removing ffmpeg dependency in tests. * Small fixes. * Slight cleanup. * Adding docs and adressing comments. * Quality. * Update docs/source/main_classes/pipelines.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/question_answering.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/zero_shot_classification.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Improving docs. * Update docs/source/main_classes/pipelines.rst Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> * N -> oberved_batch_size softmax trick. * Follow `padding_side`. * Supporting image pipeline batching (and padding). * Rename `unbatch` -> `loader_batch`. * unbatch_size forgot. * Custom padding for offset mappings. * Attempt to remove librosa. * Adding require_audio. * torchaudio. * Back to using datasets librosa. * Adding help to set a pad_token on the tokenizer. * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Quality. Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com>
This commit is contained in:
@@ -71,6 +71,11 @@ GPU. If it doesn't don't hesitate to create an issue.
|
||||
|
||||
.. code-block::
|
||||
|
||||
import datasets
|
||||
from transformers import pipeline
|
||||
from transformers.pipelines.base import KeyDataset
|
||||
import tqdm
|
||||
|
||||
pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
|
||||
dataset = datasets.load_dataset("superb", name="asr", split="test")
|
||||
|
||||
@@ -85,6 +90,144 @@ GPU. If it doesn't don't hesitate to create an issue.
|
||||
|
||||
.. autofunction:: transformers.pipeline
|
||||
|
||||
Pipeline batching
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
All pipelines (except `zero-shot-classification` and `question-answering` currently) can use batching. This will work
|
||||
whenever the pipeline uses its streaming ability (so when passing lists or :obj:`Dataset`).
|
||||
|
||||
.. code-block::
|
||||
|
||||
from transformers import pipeline
|
||||
from transformers.pipelines.base import KeyDataset
|
||||
import datasets
|
||||
import tqdm
|
||||
|
||||
dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
|
||||
pipe = pipeline("text-classification", device=0)
|
||||
for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"):
|
||||
print(out)
|
||||
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
|
||||
# Exactly the same output as before, but the content are passed
|
||||
# as batches to the model
|
||||
|
||||
|
||||
.. warning::
|
||||
|
||||
However, this is not automatically a win for performance. It can be either a 10x speedup or 5x slowdown depending
|
||||
on hardware, data and the actual model being used.
|
||||
|
||||
Example where it's most a speedup:
|
||||
|
||||
|
||||
.. code-block::
|
||||
|
||||
from transformers import pipeline
|
||||
from torch.utils.data import Dataset
|
||||
import tqdm
|
||||
|
||||
|
||||
pipe = pipeline("text-classification", device=0)
|
||||
|
||||
|
||||
class MyDataset(Dataset):
|
||||
def __len__(self):
|
||||
return 5000
|
||||
|
||||
def __getitem__(self, i):
|
||||
return "This is a test"
|
||||
|
||||
|
||||
dataset = MyDataset()
|
||||
|
||||
for batch_size in [1, 8, 64, 256]:
|
||||
print("-" * 30)
|
||||
print(f"Streaming batch_size={batch_size}")
|
||||
for out in tqdm.tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
|
||||
pass
|
||||
|
||||
|
||||
.. code-block::
|
||||
|
||||
# On GTX 970
|
||||
------------------------------
|
||||
Streaming no batching
|
||||
100%|██████████████████████████████████████████████████████████████████████| 5000/5000 [00:26<00:00, 187.52it/s]
|
||||
------------------------------
|
||||
Streaming batch_size=8
|
||||
100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1205.95it/s]
|
||||
------------------------------
|
||||
Streaming batch_size=64
|
||||
100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2478.24it/s]
|
||||
------------------------------
|
||||
Streaming batch_size=256
|
||||
100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2554.43it/s]
|
||||
(diminishing returns, saturated the GPU)
|
||||
|
||||
|
||||
Example where it's most a slowdown:
|
||||
|
||||
.. code-block::
|
||||
|
||||
class MyDataset(Dataset):
|
||||
def __len__(self):
|
||||
return 5000
|
||||
|
||||
def __getitem__(self, i):
|
||||
if i % 64 == 0:
|
||||
n = 100
|
||||
else:
|
||||
n = 1
|
||||
return "This is a test" * n
|
||||
|
||||
This is a occasional very long sentence compared to the other. In that case, the **whole** batch will need to be 400
|
||||
tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to the high slowdown. Even worse, on
|
||||
bigger batches, the program simply crashes.
|
||||
|
||||
|
||||
.. code-block::
|
||||
|
||||
------------------------------
|
||||
Streaming no batching
|
||||
100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s]
|
||||
------------------------------
|
||||
Streaming batch_size=8
|
||||
100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 265.74it/s]
|
||||
------------------------------
|
||||
Streaming batch_size=64
|
||||
100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.80it/s]
|
||||
------------------------------
|
||||
Streaming batch_size=256
|
||||
0%| | 0/1000 [00:00<?, ?it/s]
|
||||
Traceback (most recent call last):
|
||||
File "/home/nicolas/src/transformers/test.py", line 42, in <module>
|
||||
for out in tqdm.tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
|
||||
....
|
||||
q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head)
|
||||
RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch)
|
||||
|
||||
|
||||
There are no good (general) solutions for this problem, and your mileage may vary depending on your use cases. Rule of
|
||||
thumb:
|
||||
|
||||
For users, a rule of thumb is:
|
||||
|
||||
- **Measure performance on your load, with your hardware. Measure, measure, and keep measuring. Real numbers are the
|
||||
only way to go.**
|
||||
- If you are latency constrained (live product doing inference), don't batch
|
||||
- If you are using CPU, don't batch.
|
||||
- If you are using throughput (you want to run your model on a bunch of static data), on GPU, then:
|
||||
|
||||
- If you have no clue about the size of the sequence_length ("natural" data), by default don't batch, measure and
|
||||
try tentatively to add it, add OOM checks to recover when it will fail (and it will at some point if you don't
|
||||
control the sequence_length.)
|
||||
- If your sequence_length is super regular, then batching is more likely to be VERY interesting, measure and push
|
||||
it until you get OOMs.
|
||||
- The larger the GPU the more likely batching is going to be more interesting
|
||||
- As soon as you enable batching, make sure you can handle OOMs nicely.
|
||||
|
||||
|
||||
|
||||
Implementing a pipeline
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
||||
@@ -584,6 +584,7 @@ if is_torch_available():
|
||||
[
|
||||
"MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
|
||||
"MODEL_FOR_CAUSAL_LM_MAPPING",
|
||||
"MODEL_FOR_CTC_MAPPING",
|
||||
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
|
||||
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
|
||||
"MODEL_FOR_MASKED_LM_MAPPING",
|
||||
@@ -594,6 +595,7 @@ if is_torch_available():
|
||||
"MODEL_FOR_QUESTION_ANSWERING_MAPPING",
|
||||
"MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
|
||||
"MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
|
||||
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
|
||||
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
|
||||
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
|
||||
"MODEL_MAPPING",
|
||||
@@ -2430,6 +2432,7 @@ if TYPE_CHECKING:
|
||||
from .models.auto import (
|
||||
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_CAUSAL_LM_MAPPING,
|
||||
MODEL_FOR_CTC_MAPPING,
|
||||
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
||||
MODEL_FOR_MASKED_LM_MAPPING,
|
||||
@@ -2440,6 +2443,7 @@ if TYPE_CHECKING:
|
||||
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
|
||||
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
||||
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
||||
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
|
||||
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
||||
MODEL_MAPPING,
|
||||
|
||||
@@ -169,6 +169,10 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
|
||||
elif model_class in MODEL_FOR_CTC_MAPPING.values():
|
||||
outputs = self.model(**model_inputs)
|
||||
tokens = outputs.logits.squeeze(0).argmax(dim=-1)
|
||||
else:
|
||||
logger.warning("This is an unknown class, treating it as CTC.")
|
||||
outputs = self.model(**model_inputs)
|
||||
tokens = outputs.logits.squeeze(0).argmax(dim=-1)
|
||||
return tokens
|
||||
|
||||
def postprocess(self, model_outputs):
|
||||
|
||||
@@ -25,6 +25,7 @@ from contextlib import contextmanager
|
||||
from os.path import abspath, exists
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
from packaging import version
|
||||
|
||||
from ..feature_extraction_utils import PreTrainedFeatureExtractor
|
||||
@@ -59,12 +60,80 @@ if TYPE_CHECKING:
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def collate_fn(items):
|
||||
def no_collate_fn(items):
|
||||
if len(items) != 1:
|
||||
raise ValueError("This collate_fn is meant to be used with batch_size=1")
|
||||
return items[0]
|
||||
|
||||
|
||||
def _pad(items, key, padding_value, padding_side):
|
||||
batch_size = len(items)
|
||||
if isinstance(items[0][key], torch.Tensor):
|
||||
# Others include `attention_mask` etc...
|
||||
shape = items[0][key].shape
|
||||
dim = len(shape)
|
||||
if dim == 4:
|
||||
# This is probable image so padding shouldn't be necessary
|
||||
# B, C, H, W
|
||||
return torch.cat([item[key] for item in items], dim=0)
|
||||
max_length = max(item[key].shape[1] for item in items)
|
||||
dtype = items[0][key].dtype
|
||||
|
||||
if dim == 2:
|
||||
tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
|
||||
elif dim == 3:
|
||||
tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
|
||||
|
||||
for i, item in enumerate(items):
|
||||
if dim == 2:
|
||||
if padding_side == "left":
|
||||
tensor[i, -len(item[key][0]) :] = item[key][0].clone()
|
||||
else:
|
||||
tensor[i, : len(item[key][0])] = item[key][0].clone()
|
||||
elif dim == 3:
|
||||
if padding_side == "left":
|
||||
tensor[i, -len(item[key][0]) :, :] = item[key][0].clone()
|
||||
else:
|
||||
tensor[i, : len(item[key][0]), :] = item[key][0].clone()
|
||||
return tensor
|
||||
else:
|
||||
return [item[key] for item in items]
|
||||
|
||||
|
||||
def pad_collate_fn(tokenizer, feature_extractor):
|
||||
padding_side = "right"
|
||||
if tokenizer is None and feature_extractor is None:
|
||||
raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching")
|
||||
if tokenizer is not None:
|
||||
if tokenizer.pad_token_id is None:
|
||||
raise ValueError(
|
||||
"Pipeline with tokenizer without pad_token cannot do batching. You can try to set it with "
|
||||
"`pipe.tokenizer.pad_token_id = model.config.eos_token_id`."
|
||||
)
|
||||
else:
|
||||
padding_value = tokenizer.pad_token_id
|
||||
padding_side = tokenizer.padding_side
|
||||
if feature_extractor is not None:
|
||||
# Feature extractor can be images, where no padding is expected
|
||||
padding_value = getattr(feature_extractor, "padding_value", None)
|
||||
padding_side = getattr(feature_extractor, "padding_side", None)
|
||||
|
||||
def inner(items):
|
||||
keys = set(items[0].keys())
|
||||
for item in items:
|
||||
if set(item.keys()) != keys:
|
||||
raise ValueError(
|
||||
f"The elements of the batch contain different keys. Cannot batch them ({set(item.keys())} != {keys})"
|
||||
)
|
||||
# input_values, input_pixels, input_ids, ...
|
||||
padded = {
|
||||
key: _pad(items, key, padding_value if key.startswith("input_") else 0, padding_side) for key in keys
|
||||
}
|
||||
return padded
|
||||
|
||||
return inner
|
||||
|
||||
|
||||
def infer_framework_load_model(
|
||||
model,
|
||||
config: AutoConfig,
|
||||
@@ -591,6 +660,13 @@ PIPELINE_INIT_ARGS = r"""
|
||||
is provided.
|
||||
task (:obj:`str`, defaults to :obj:`""`):
|
||||
A task-identifier for the pipeline.
|
||||
num_workers (:obj:`int`, `optional`, defaults to 8):
|
||||
When the pipeline will use `DataLoader` (when passing a dataset, on GPU for a Pytorch model), the number of
|
||||
workers to be used.
|
||||
batch_size (:obj:`int`, `optional`, defaults to 1):
|
||||
When the pipeline will use `DataLoader` (when passing a dataset, on GPU for a Pytorch model), the size of
|
||||
the batch to use, for inference this is not always beneficial, please read `Batching with pipelines
|
||||
<https://huggingface.co/transformers/main_classes/pipelines.html#pipeline-batching>`_ .
|
||||
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
|
||||
Reference to the object in charge of parsing supplied pipeline parameters.
|
||||
device (:obj:`int`, `optional`, defaults to -1):
|
||||
@@ -617,10 +693,44 @@ if is_torch_available():
|
||||
return processed
|
||||
|
||||
class PipelineIterator(IterableDataset):
|
||||
def __init__(self, loader, infer, params):
|
||||
def __init__(self, loader, infer, params, loader_batch_size=None):
|
||||
"""
|
||||
Roughly equivalent to
|
||||
|
||||
.. code-block::
|
||||
for item in loader:
|
||||
yield infer(item, **params)
|
||||
|
||||
Arguments:
|
||||
loader (:obj:`torch.utils.data.DataLoader` or any iterator):
|
||||
The iterator that will be used to apply :obj:`infer` on.
|
||||
infer (any function):
|
||||
The function to apply of each element of :obj:`loader`.
|
||||
params (:obj:`dict`):
|
||||
The parameters passed to :obj:`infer` along with every item
|
||||
loader_batch_size (:obj:`int`, `optional`):
|
||||
If specified, the items of :obj:`loader` are supposed to come as batch, and are loader_batched here
|
||||
making it roughly behave as
|
||||
|
||||
|
||||
.. code-block::
|
||||
|
||||
for items in loader:
|
||||
for i in loader_batch_size:
|
||||
item = items[i]
|
||||
yield infer(item, **params)
|
||||
"""
|
||||
self.loader = loader
|
||||
self.infer = infer
|
||||
self.params = params
|
||||
if loader_batch_size == 1:
|
||||
# Let's spare some time by deactivating altogether
|
||||
loader_batch_size = None
|
||||
self.loader_batch_size = loader_batch_size
|
||||
|
||||
# Internal bookkeeping
|
||||
self._loader_batch_index = None
|
||||
self._loader_batch_data = None
|
||||
|
||||
def __len__(self):
|
||||
return len(self.loader)
|
||||
@@ -629,9 +739,48 @@ if is_torch_available():
|
||||
self.iterator = iter(self.loader)
|
||||
return self
|
||||
|
||||
def loader_batch_item(self):
|
||||
if isinstance(self._loader_batch_data, torch.Tensor):
|
||||
result = self._loader_batch_data[self._loader_batch_index]
|
||||
else:
|
||||
loader_batched = {}
|
||||
for k, element in self._loader_batch_data.items():
|
||||
if k == "past_key_values":
|
||||
continue
|
||||
if isinstance(element[self._loader_batch_index], torch.Tensor):
|
||||
loader_batched[k] = element[self._loader_batch_index].unsqueeze(0)
|
||||
elif isinstance(element[self._loader_batch_index], np.ndarray):
|
||||
loader_batched[k] = np.expand_dims(element[self._loader_batch_index], 0)
|
||||
else:
|
||||
loader_batched[k] = element[self._loader_batch_index]
|
||||
result = self._loader_batch_data.__class__(loader_batched)
|
||||
self._loader_batch_index += 1
|
||||
return result
|
||||
|
||||
def __next__(self):
|
||||
if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size:
|
||||
return self.loader_batch_item()
|
||||
|
||||
item = next(self.iterator)
|
||||
processed = self.infer(item, **self.params)
|
||||
if self.loader_batch_size is not None:
|
||||
if isinstance(processed, torch.Tensor):
|
||||
first_tensor = processed
|
||||
else:
|
||||
key = list(processed.keys())[0]
|
||||
first_tensor = processed[key]
|
||||
if isinstance(first_tensor, list):
|
||||
observed_batch_size = len(first_tensor)
|
||||
else:
|
||||
observed_batch_size = first_tensor.shape[0]
|
||||
if 0 < observed_batch_size < self.loader_batch_size:
|
||||
# Could be last batch so we can't unroll as many
|
||||
# elements.
|
||||
self.loader_batch_size = observed_batch_size
|
||||
self._loader_batch_data = processed
|
||||
self._loader_batch_index = 0
|
||||
return self.loader_batch_item()
|
||||
else:
|
||||
return processed
|
||||
|
||||
class KeyDataset(Dataset):
|
||||
@@ -881,17 +1030,20 @@ class Pipeline(_ScikitCompat):
|
||||
raise ValueError(f"Framework {self.framework} is not supported")
|
||||
return model_outputs
|
||||
|
||||
def get_iterator(self, inputs, num_workers: int, preprocess_params, forward_params, postprocess_params):
|
||||
def get_iterator(
|
||||
self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
|
||||
):
|
||||
if "TOKENIZERS_PARALLELISM" not in os.environ:
|
||||
logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
dataset = PipelineDataset(inputs, self.preprocess, preprocess_params)
|
||||
dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=1, collate_fn=collate_fn)
|
||||
model_iterator = PipelineIterator(dataloader, self.forward, forward_params)
|
||||
collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, self.feature_extractor)
|
||||
dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
|
||||
model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
|
||||
final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
|
||||
return final_iterator
|
||||
|
||||
def __call__(self, inputs, *args, num_workers=8, **kwargs):
|
||||
def __call__(self, inputs, *args, num_workers=0, batch_size=1, **kwargs):
|
||||
if args:
|
||||
logger.warning(f"Ignoring args : {args}")
|
||||
preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(**kwargs)
|
||||
@@ -910,14 +1062,16 @@ class Pipeline(_ScikitCompat):
|
||||
if isinstance(inputs, list):
|
||||
if self.framework == "pt":
|
||||
final_iterator = self.get_iterator(
|
||||
inputs, num_workers, preprocess_params, forward_params, postprocess_params
|
||||
inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
|
||||
)
|
||||
outputs = [output for output in final_iterator]
|
||||
return outputs
|
||||
else:
|
||||
return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
|
||||
elif Dataset is not None and isinstance(inputs, Dataset):
|
||||
return self.get_iterator(inputs, num_workers, preprocess_params, forward_params, postprocess_params)
|
||||
return self.get_iterator(
|
||||
inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
|
||||
)
|
||||
else:
|
||||
return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
|
||||
|
||||
|
||||
@@ -243,7 +243,7 @@ class ConversationalPipeline(Pipeline):
|
||||
return outputs[0]
|
||||
return outputs
|
||||
|
||||
def preprocess(self, conversation: Conversation) -> Dict[str, Any]:
|
||||
def preprocess(self, conversation: Conversation, min_length_for_response=32) -> Dict[str, Any]:
|
||||
if not isinstance(conversation, Conversation):
|
||||
raise ValueError("ConversationalPipeline, expects Conversation as inputs")
|
||||
if conversation.new_user_input is None:
|
||||
@@ -274,18 +274,18 @@ class ConversationalPipeline(Pipeline):
|
||||
if "attention_mask" in model_inputs:
|
||||
model_inputs["attention_mask"] = model_inputs["attention_mask"][:, -trim:]
|
||||
conversation = model_inputs.pop("conversation")
|
||||
model_inputs["max_length"] = max_length
|
||||
generate_kwargs["max_length"] = max_length
|
||||
output_ids = self.model.generate(**model_inputs, **generate_kwargs)
|
||||
if self.model.config.is_encoder_decoder:
|
||||
start_position = 1
|
||||
else:
|
||||
start_position = n
|
||||
return {"output_ids": output_ids[0, start_position:], "conversation": conversation}
|
||||
return {"output_ids": output_ids[:, start_position:], "conversation": conversation}
|
||||
|
||||
def postprocess(self, model_outputs, clean_up_tokenization_spaces=True):
|
||||
output_ids = model_outputs["output_ids"]
|
||||
answer = self.tokenizer.decode(
|
||||
output_ids,
|
||||
output_ids[0],
|
||||
skip_special_tokens=True,
|
||||
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||
)
|
||||
|
||||
@@ -89,14 +89,14 @@ class FillMaskPipeline(Pipeline):
|
||||
|
||||
def _forward(self, model_inputs):
|
||||
model_outputs = self.model(**model_inputs)
|
||||
model_outputs["input_ids"] = model_inputs["input_ids"][0]
|
||||
model_outputs["input_ids"] = model_inputs["input_ids"]
|
||||
return model_outputs
|
||||
|
||||
def postprocess(self, model_outputs, top_k=5, target_ids=None):
|
||||
# Cap top_k if there are targets
|
||||
if target_ids is not None and target_ids.shape[0] < top_k:
|
||||
top_k = target_ids.shape[0]
|
||||
input_ids = model_outputs["input_ids"]
|
||||
input_ids = model_outputs["input_ids"][0]
|
||||
outputs = model_outputs["logits"]
|
||||
result = []
|
||||
|
||||
|
||||
@@ -114,11 +114,12 @@ class ObjectDetectionPipeline(Pipeline):
|
||||
def _forward(self, model_inputs):
|
||||
target_size = model_inputs.pop("target_size")
|
||||
outputs = self.model(**model_inputs)
|
||||
model_outputs = {"outputs": outputs, "target_size": target_size}
|
||||
model_outputs = outputs.__class__({"target_size": target_size, **outputs})
|
||||
return model_outputs
|
||||
|
||||
def postprocess(self, model_outputs, threshold=0.9):
|
||||
raw_annotations = self.feature_extractor.post_process(model_outputs["outputs"], model_outputs["target_size"])
|
||||
target_size = model_outputs["target_size"]
|
||||
raw_annotations = self.feature_extractor.post_process(model_outputs, target_size)
|
||||
raw_annotation = raw_annotations[0]
|
||||
keep = raw_annotation["scores"] > threshold
|
||||
scores = raw_annotation["scores"][keep]
|
||||
|
||||
@@ -8,9 +8,12 @@ from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_featur
|
||||
from ..file_utils import PaddingStrategy, add_end_docstrings, is_tf_available, is_torch_available
|
||||
from ..modelcard import ModelCard
|
||||
from ..tokenization_utils import PreTrainedTokenizer
|
||||
from ..utils import logging
|
||||
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..modeling_tf_utils import TFPreTrainedModel
|
||||
from ..modeling_utils import PreTrainedModel
|
||||
@@ -241,6 +244,9 @@ class QuestionAnsweringPipeline(Pipeline):
|
||||
- **end** (:obj:`int`) -- The character end index of the answer (in the tokenized version of the input).
|
||||
- **answer** (:obj:`str`) -- The answer to the question.
|
||||
"""
|
||||
if kwargs.get("batch_size", 1) > 1:
|
||||
logger.error("Batch_size > 1 is not supported for question answering pipeline, setting it to 1.")
|
||||
kwargs["batch_size"] = 1
|
||||
|
||||
# Convert inputs to features
|
||||
examples = self._args_parser(*args, **kwargs)
|
||||
|
||||
@@ -204,12 +204,12 @@ class TokenClassificationPipeline(Pipeline):
|
||||
offset_mapping = model_inputs.pop("offset_mapping", None)
|
||||
sentence = model_inputs.pop("sentence")
|
||||
if self.framework == "tf":
|
||||
outputs = self.model(model_inputs.data)[0][0]
|
||||
logits = self.model(model_inputs.data)[0]
|
||||
else:
|
||||
outputs = self.model(**model_inputs)[0][0]
|
||||
logits = self.model(**model_inputs)[0]
|
||||
|
||||
return {
|
||||
"outputs": outputs,
|
||||
"logits": logits,
|
||||
"special_tokens_mask": special_tokens_mask,
|
||||
"offset_mapping": offset_mapping,
|
||||
"sentence": sentence,
|
||||
@@ -217,13 +217,16 @@ class TokenClassificationPipeline(Pipeline):
|
||||
}
|
||||
|
||||
def postprocess(self, model_outputs, aggregation_strategy=AggregationStrategy.NONE):
|
||||
outputs = model_outputs["outputs"].numpy()
|
||||
logits = model_outputs["logits"][0].numpy()
|
||||
sentence = model_outputs["sentence"]
|
||||
input_ids = model_outputs["input_ids"][0]
|
||||
offset_mapping = model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None
|
||||
special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy()
|
||||
|
||||
scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
|
||||
maxes = np.max(logits, axis=-1, keepdims=True)
|
||||
shifted_exp = np.exp(logits - maxes)
|
||||
scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
|
||||
|
||||
pre_entities = self.gather_pre_entities(
|
||||
sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
|
||||
)
|
||||
|
||||
@@ -183,6 +183,9 @@ class ZeroShotClassificationPipeline(Pipeline):
|
||||
- **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood.
|
||||
- **scores** (:obj:`List[float]`) -- The probabilities for each of the labels.
|
||||
"""
|
||||
if kwargs.get("batch_size", 1) > 1:
|
||||
logger.error("Batch size > 1 is not supported for zero-shot pipeline, setting batch_size=1.")
|
||||
kwargs["batch_size"] = 1
|
||||
|
||||
if len(args) == 0:
|
||||
pass
|
||||
|
||||
@@ -313,6 +313,9 @@ MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = None
|
||||
MODEL_FOR_CAUSAL_LM_MAPPING = None
|
||||
|
||||
|
||||
MODEL_FOR_CTC_MAPPING = None
|
||||
|
||||
|
||||
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None
|
||||
|
||||
|
||||
@@ -343,6 +346,9 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None
|
||||
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = None
|
||||
|
||||
|
||||
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = None
|
||||
|
||||
|
||||
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None
|
||||
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ from transformers.testing_utils import (
|
||||
require_datasets,
|
||||
require_tf,
|
||||
require_torch,
|
||||
require_torchaudio,
|
||||
slow,
|
||||
)
|
||||
|
||||
@@ -35,15 +36,16 @@ from .test_pipelines_common import ANY, PipelineTestCaseMeta
|
||||
class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||
model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
|
||||
|
||||
@require_datasets
|
||||
@slow
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
import datasets
|
||||
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=feature_extractor)
|
||||
|
||||
# test with a raw waveform
|
||||
audio = np.zeros((34000,))
|
||||
audio2 = np.zeros((14000,))
|
||||
return audio_classifier, [audio2, audio]
|
||||
|
||||
def run_pipeline_test(self, audio_classifier, examples):
|
||||
audio2, audio = examples
|
||||
output = audio_classifier(audio)
|
||||
# by default a model is initialized with num_labels=2
|
||||
self.assertEqual(
|
||||
@@ -61,10 +63,17 @@ class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
|
||||
],
|
||||
)
|
||||
|
||||
self.run_torchaudio(audio_classifier)
|
||||
|
||||
@require_datasets
|
||||
@require_torchaudio
|
||||
def run_torchaudio(self, audio_classifier):
|
||||
import datasets
|
||||
|
||||
# test with a local file
|
||||
dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
filename = dataset[0]["file"]
|
||||
output = audio_classifier(filename)
|
||||
audio = dataset[0]["audio"]["array"]
|
||||
output = audio_classifier(audio)
|
||||
self.assertEqual(
|
||||
output,
|
||||
[
|
||||
|
||||
@@ -14,11 +14,28 @@
|
||||
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from transformers import AutoFeatureExtractor, AutoTokenizer, Speech2TextForConditionalGeneration, Wav2Vec2ForCTC
|
||||
from transformers import (
|
||||
MODEL_FOR_CTC_MAPPING,
|
||||
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
||||
AutoFeatureExtractor,
|
||||
AutoTokenizer,
|
||||
Speech2TextForConditionalGeneration,
|
||||
Wav2Vec2ForCTC,
|
||||
)
|
||||
from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline
|
||||
from transformers.testing_utils import is_pipeline_test, require_datasets, require_torch, require_torchaudio, slow
|
||||
from transformers.testing_utils import (
|
||||
is_pipeline_test,
|
||||
require_datasets,
|
||||
require_tf,
|
||||
require_torch,
|
||||
require_torchaudio,
|
||||
slow,
|
||||
)
|
||||
|
||||
from .test_pipelines_common import ANY, PipelineTestCaseMeta
|
||||
|
||||
|
||||
# We can't use this mixin because it assumes TF support.
|
||||
@@ -26,14 +43,42 @@ from transformers.testing_utils import is_pipeline_test, require_datasets, requi
|
||||
|
||||
|
||||
@is_pipeline_test
|
||||
class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
|
||||
class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||
model_mapping = {
|
||||
k: v
|
||||
for k, v in (list(MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.items()) if MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING else [])
|
||||
+ (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else [])
|
||||
}
|
||||
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
if tokenizer is None:
|
||||
# Side effect of no Fast Tokenizer class for these model, so skipping
|
||||
# But the slow tokenizer test should still run as they're quite small
|
||||
self.skipTest("No tokenizer available")
|
||||
return
|
||||
# return None, None
|
||||
|
||||
speech_recognizer = AutomaticSpeechRecognitionPipeline(
|
||||
model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
|
||||
)
|
||||
|
||||
# test with a raw waveform
|
||||
audio = np.zeros((34000,))
|
||||
audio2 = np.zeros((14000,))
|
||||
return speech_recognizer, [audio, audio2]
|
||||
|
||||
def run_pipeline_test(self, speech_recognizer, examples):
|
||||
audio = np.zeros((34000,))
|
||||
outputs = speech_recognizer(audio)
|
||||
self.assertEqual(outputs, {"text": ANY(str)})
|
||||
|
||||
@require_torch
|
||||
@slow
|
||||
def test_pt_defaults(self):
|
||||
pipeline("automatic-speech-recognition", framework="pt")
|
||||
|
||||
@require_torch
|
||||
def test_torch_small(self):
|
||||
def test_small_model_pt(self):
|
||||
import numpy as np
|
||||
|
||||
speech_recognizer = pipeline(
|
||||
@@ -46,6 +91,10 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
|
||||
output = speech_recognizer(waveform)
|
||||
self.assertEqual(output, {"text": "(Applaudissements)"})
|
||||
|
||||
@require_tf
|
||||
def test_small_model_tf(self):
|
||||
self.skipTest("Tensorflow not supported yet.")
|
||||
|
||||
@require_torch
|
||||
def test_torch_small_no_tokenizer_files(self):
|
||||
# test that model without tokenizer file cannot be loaded
|
||||
|
||||
@@ -12,8 +12,10 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
import importlib
|
||||
import logging
|
||||
import random
|
||||
import string
|
||||
import unittest
|
||||
from abc import abstractmethod
|
||||
@@ -21,6 +23,7 @@ from functools import lru_cache
|
||||
from unittest import skipIf
|
||||
|
||||
from transformers import FEATURE_EXTRACTOR_MAPPING, TOKENIZER_MAPPING, AutoFeatureExtractor, AutoTokenizer, pipeline
|
||||
from transformers.pipelines.base import _pad
|
||||
from transformers.testing_utils import is_pipeline_test, require_torch
|
||||
|
||||
|
||||
@@ -73,6 +76,12 @@ def get_tiny_config_from_class(configuration_class):
|
||||
@lru_cache(maxsize=100)
|
||||
def get_tiny_tokenizer_from_checkpoint(checkpoint):
|
||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
||||
if tokenizer.vocab_size < 300:
|
||||
# Wav2Vec2ForCTC for instance
|
||||
# ByT5Tokenizer
|
||||
# all are already small enough and have no Fast version that can
|
||||
# be retrained
|
||||
return tokenizer
|
||||
logger.info("Training new from iterator ...")
|
||||
vocabulary = string.ascii_letters + string.digits + " "
|
||||
tokenizer = tokenizer.train_new_from_iterator(vocabulary, vocab_size=len(vocabulary), show_progress=False)
|
||||
@@ -87,6 +96,12 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config):
|
||||
feature_extractor = None
|
||||
if hasattr(tiny_config, "image_size") and feature_extractor:
|
||||
feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
|
||||
|
||||
# Speech2TextModel specific.
|
||||
if hasattr(tiny_config, "input_feat_per_channel") and feature_extractor:
|
||||
feature_extractor = feature_extractor.__class__(
|
||||
feature_size=tiny_config.input_feat_per_channel, num_mel_bins=tiny_config.input_feat_per_channel
|
||||
)
|
||||
return feature_extractor
|
||||
|
||||
|
||||
@@ -136,7 +151,26 @@ class PipelineTestCaseMeta(type):
|
||||
else:
|
||||
tokenizer = None
|
||||
feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config)
|
||||
self.run_pipeline_test(model, tokenizer, feature_extractor)
|
||||
pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor)
|
||||
if pipeline is None:
|
||||
# The test can disable itself, but it should be very marginal
|
||||
# Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist)
|
||||
return
|
||||
self.run_pipeline_test(pipeline, examples)
|
||||
|
||||
def run_batch_test(pipeline, examples):
|
||||
# Need to copy because `Conversation` are stateful
|
||||
if pipeline.tokenizer is not None and pipeline.tokenizer.pad_token_id is None:
|
||||
return # No batching for this and it's OK
|
||||
|
||||
# 10 examples with batch size 4 means there needs to be a unfinished batch
|
||||
# which is important for the unbatcher
|
||||
dataset = [copy.deepcopy(random.choice(examples)) for i in range(10)]
|
||||
|
||||
for item in pipeline(dataset, batch_size=4):
|
||||
pass
|
||||
|
||||
run_batch_test(pipeline, examples)
|
||||
|
||||
return test
|
||||
|
||||
@@ -211,3 +245,85 @@ class CommonPipelineTest(unittest.TestCase):
|
||||
dataset = MyDataset()
|
||||
for output in text_classifier(dataset):
|
||||
self.assertEqual(output, {"label": ANY(str), "score": ANY(float)})
|
||||
|
||||
|
||||
@is_pipeline_test
|
||||
class PipelinePadTest(unittest.TestCase):
|
||||
@require_torch
|
||||
def test_pipeline_padding(self):
|
||||
import torch
|
||||
|
||||
items = [
|
||||
{
|
||||
"label": "label1",
|
||||
"input_ids": torch.LongTensor([[1, 23, 24, 2]]),
|
||||
"attention_mask": torch.LongTensor([[0, 1, 1, 0]]),
|
||||
},
|
||||
{
|
||||
"label": "label2",
|
||||
"input_ids": torch.LongTensor([[1, 23, 24, 43, 44, 2]]),
|
||||
"attention_mask": torch.LongTensor([[0, 1, 1, 1, 1, 0]]),
|
||||
},
|
||||
]
|
||||
|
||||
self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"])
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
_pad(items, "input_ids", 10, "right"),
|
||||
torch.LongTensor([[1, 23, 24, 2, 10, 10], [1, 23, 24, 43, 44, 2]]),
|
||||
)
|
||||
)
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
_pad(items, "input_ids", 10, "left"),
|
||||
torch.LongTensor([[10, 10, 1, 23, 24, 2], [1, 23, 24, 43, 44, 2]]),
|
||||
)
|
||||
)
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
_pad(items, "attention_mask", 0, "right"), torch.LongTensor([[0, 1, 1, 0, 0, 0], [0, 1, 1, 1, 1, 0]])
|
||||
)
|
||||
)
|
||||
|
||||
@require_torch
|
||||
def test_pipeline_image_padding(self):
|
||||
import torch
|
||||
|
||||
items = [
|
||||
{
|
||||
"label": "label1",
|
||||
"pixel_values": torch.zeros((1, 3, 10, 10)),
|
||||
},
|
||||
{
|
||||
"label": "label2",
|
||||
"pixel_values": torch.zeros((1, 3, 10, 10)),
|
||||
},
|
||||
]
|
||||
|
||||
self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"])
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
_pad(items, "pixel_values", 10, "right"),
|
||||
torch.zeros((2, 3, 10, 10)),
|
||||
)
|
||||
)
|
||||
|
||||
@require_torch
|
||||
def test_pipeline_offset_mapping(self):
|
||||
import torch
|
||||
|
||||
items = [
|
||||
{
|
||||
"offset_mappings": torch.zeros([1, 11, 2], dtype=torch.long),
|
||||
},
|
||||
{
|
||||
"offset_mappings": torch.zeros([1, 4, 2], dtype=torch.long),
|
||||
},
|
||||
]
|
||||
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
_pad(items, "offset_mappings", 0, "right"),
|
||||
torch.zeros((2, 11, 2), dtype=torch.long),
|
||||
),
|
||||
)
|
||||
|
||||
@@ -54,8 +54,11 @@ class ConversationalPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM
|
||||
else []
|
||||
)
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer)
|
||||
return conversation_agent, [Conversation("Hi there!")]
|
||||
|
||||
def run_pipeline_test(self, conversation_agent, _):
|
||||
# Simple
|
||||
outputs = conversation_agent(Conversation("Hi there!"))
|
||||
self.assertEqual(outputs, Conversation(past_user_inputs=["Hi there!"], generated_responses=[ANY(str)]))
|
||||
|
||||
@@ -14,7 +14,15 @@
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers import MODEL_MAPPING, TF_MODEL_MAPPING, CLIPConfig, FeatureExtractionPipeline, LxmertConfig, pipeline
|
||||
from transformers import (
|
||||
MODEL_MAPPING,
|
||||
TF_MODEL_MAPPING,
|
||||
CLIPConfig,
|
||||
FeatureExtractionPipeline,
|
||||
LxmertConfig,
|
||||
Wav2Vec2Config,
|
||||
pipeline,
|
||||
)
|
||||
from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch
|
||||
|
||||
from .test_pipelines_common import PipelineTestCaseMeta
|
||||
@@ -61,12 +69,12 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
|
||||
raise ValueError("We expect lists of floats, nothing else")
|
||||
return shape
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
if tokenizer is None:
|
||||
self.skipTest("No tokenizer")
|
||||
return
|
||||
|
||||
elif isinstance(model.config, (LxmertConfig, CLIPConfig)):
|
||||
elif isinstance(model.config, (LxmertConfig, CLIPConfig, Wav2Vec2Config)):
|
||||
self.skipTest(
|
||||
"This is an Lxmert bimodal model, we need to find a more consistent way to switch on those models."
|
||||
)
|
||||
@@ -81,11 +89,12 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
|
||||
)
|
||||
|
||||
return
|
||||
|
||||
feature_extractor = FeatureExtractionPipeline(
|
||||
model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
|
||||
)
|
||||
return feature_extractor, ["This is a test", "This is another test"]
|
||||
|
||||
def run_pipeline_test(self, feature_extractor, examples):
|
||||
outputs = feature_extractor("This is a test")
|
||||
|
||||
shape = self.get_shape(outputs)
|
||||
|
||||
@@ -159,22 +159,32 @@ class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="pt")
|
||||
unmasker.tokenizer.pad_token_id = None
|
||||
unmasker.tokenizer.pad_token = None
|
||||
self.run_pipeline_test(unmasker.model, unmasker.tokenizer, None)
|
||||
self.run_pipeline_test(unmasker, [])
|
||||
|
||||
@require_tf
|
||||
def test_model_no_pad_tf(self):
|
||||
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="tf")
|
||||
unmasker.tokenizer.pad_token_id = None
|
||||
unmasker.tokenizer.pad_token = None
|
||||
self.run_pipeline_test(unmasker.model, unmasker.tokenizer, None)
|
||||
self.run_pipeline_test(unmasker, [])
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
if tokenizer is None or tokenizer.mask_token_id is None:
|
||||
self.skipTest("The provided tokenizer has no mask token, (probably reformer or wav2vec2)")
|
||||
|
||||
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
|
||||
examples = [
|
||||
f"This is another {tokenizer.mask_token} test",
|
||||
]
|
||||
return fill_masker, examples
|
||||
|
||||
outputs = fill_masker(f"This is a {tokenizer.mask_token}")
|
||||
def run_pipeline_test(self, fill_masker, examples):
|
||||
tokenizer = fill_masker.tokenizer
|
||||
model = fill_masker.model
|
||||
|
||||
outputs = fill_masker(
|
||||
f"This is a {tokenizer.mask_token}",
|
||||
)
|
||||
self.assertEqual(
|
||||
outputs,
|
||||
[
|
||||
|
||||
@@ -44,9 +44,17 @@ else:
|
||||
class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||
model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
|
||||
|
||||
@require_datasets
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
|
||||
image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor)
|
||||
examples = [
|
||||
Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
|
||||
"http://images.cocodataset.org/val2017/000000039769.jpg",
|
||||
]
|
||||
return image_classifier, examples
|
||||
|
||||
@require_datasets
|
||||
def run_pipeline_test(self, image_classifier, examples):
|
||||
outputs = image_classifier("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
|
||||
self.assertEqual(
|
||||
|
||||
@@ -53,9 +53,12 @@ else:
|
||||
class ObjectDetectionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||
model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
|
||||
|
||||
@require_datasets
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor)
|
||||
return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"]
|
||||
|
||||
@require_datasets
|
||||
def run_pipeline_test(self, object_detector, examples):
|
||||
outputs = object_detector("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
|
||||
|
||||
self.assertGreater(len(outputs), 0)
|
||||
|
||||
@@ -32,13 +32,20 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||
model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING
|
||||
tf_model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
if isinstance(model.config, LxmertConfig):
|
||||
# This is an bimodal model, we need to find a more consistent way
|
||||
# to switch on those models.
|
||||
return
|
||||
return None, None
|
||||
question_answerer = QuestionAnsweringPipeline(model, tokenizer)
|
||||
|
||||
examples = [
|
||||
{"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
|
||||
{"question": "In what field is HuggingFace ?", "context": "HuggingFace is an AI startup."},
|
||||
]
|
||||
return question_answerer, examples
|
||||
|
||||
def run_pipeline_test(self, question_answerer, _):
|
||||
outputs = question_answerer(
|
||||
question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris."
|
||||
)
|
||||
|
||||
@@ -36,8 +36,12 @@ class SummarizationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMe
|
||||
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer)
|
||||
return summarizer, ["(CNN)The Palestinian Authority officially became", "Some other text"]
|
||||
|
||||
def run_pipeline_test(self, summarizer, _):
|
||||
model = summarizer.model
|
||||
|
||||
outputs = summarizer("(CNN)The Palestinian Authority officially became")
|
||||
self.assertEqual(outputs, [{"summary_text": ANY(str)}])
|
||||
|
||||
@@ -30,9 +30,11 @@ class Text2TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTest
|
||||
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer)
|
||||
return generator, ["Something to write", "Something else"]
|
||||
|
||||
def run_pipeline_test(self, generator, _):
|
||||
outputs = generator("Something there")
|
||||
self.assertEqual(outputs, [{"generated_text": ANY(str)}])
|
||||
# These are encoder decoder, they don't just append to incoming string
|
||||
|
||||
@@ -72,9 +72,12 @@ class TextClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestC
|
||||
outputs = text_classifier("Birds are a type of animal")
|
||||
self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}])
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
|
||||
return text_classifier, ["HuggingFace is in", "This is another test"]
|
||||
|
||||
def run_pipeline_test(self, text_classifier, _):
|
||||
model = text_classifier.model
|
||||
# Small inputs because BartTokenizer tiny has maximum position embeddings = 22
|
||||
valid_inputs = "HuggingFace is in"
|
||||
outputs = text_classifier(valid_inputs)
|
||||
|
||||
@@ -88,8 +88,14 @@ class TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM
|
||||
],
|
||||
)
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)
|
||||
return text_generator, ["This is a test", "Another test"]
|
||||
|
||||
def run_pipeline_test(self, text_generator, _):
|
||||
model = text_generator.model
|
||||
tokenizer = text_generator.tokenizer
|
||||
|
||||
outputs = text_generator("This is a test")
|
||||
self.assertEqual(outputs, [{"generated_text": ANY(str)}])
|
||||
self.assertTrue(outputs[0]["generated_text"].startswith("This is a test"))
|
||||
|
||||
@@ -45,8 +45,13 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
|
||||
model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
|
||||
tf_model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer)
|
||||
return token_classifier, ["A simple string", "A simple string that is quite a bit longer"]
|
||||
|
||||
def run_pipeline_test(self, token_classifier, _):
|
||||
model = token_classifier.model
|
||||
tokenizer = token_classifier.tokenizer
|
||||
|
||||
outputs = token_classifier("A simple string")
|
||||
self.assertIsInstance(outputs, list)
|
||||
|
||||
@@ -20,6 +20,7 @@ from transformers import (
|
||||
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
||||
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
||||
MBart50TokenizerFast,
|
||||
MBartConfig,
|
||||
MBartForConditionalGeneration,
|
||||
TranslationPipeline,
|
||||
pipeline,
|
||||
@@ -34,14 +35,16 @@ class TranslationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta
|
||||
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
if isinstance(model.config, MBartConfig):
|
||||
src_lang, tgt_lang = list(tokenizer.lang_code_to_id.keys())[:2]
|
||||
translator = TranslationPipeline(model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang)
|
||||
else:
|
||||
translator = TranslationPipeline(model=model, tokenizer=tokenizer)
|
||||
try:
|
||||
return translator, ["Some string", "Some other text"]
|
||||
|
||||
def run_pipeline_test(self, translator, _):
|
||||
outputs = translator("Some string")
|
||||
except ValueError:
|
||||
# Triggered by m2m langages
|
||||
src_lang, tgt_lang = list(translator.tokenizer.lang_code_to_id.keys())[:2]
|
||||
outputs = translator("Some string", src_lang=src_lang, tgt_lang=tgt_lang)
|
||||
self.assertEqual(outputs, [{"translation_text": ANY(str)}])
|
||||
|
||||
@require_torch
|
||||
|
||||
@@ -31,9 +31,13 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase, metaclass=PipelineT
|
||||
model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
|
||||
tf_model_mapping = TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
|
||||
|
||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
||||
classifier = ZeroShotClassificationPipeline(model=model, tokenizer=tokenizer)
|
||||
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||
classifier = ZeroShotClassificationPipeline(
|
||||
model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"]
|
||||
)
|
||||
return classifier, ["Who are you voting for in 2020?", "My stomach hurts."]
|
||||
|
||||
def run_pipeline_test(self, classifier, _):
|
||||
outputs = classifier("Who are you voting for in 2020?", candidate_labels="politics")
|
||||
self.assertEqual(outputs, {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user