Adding batch_size support for (almost) all pipelines (#13724)
* Tentative enabling of `batch_size` for pipelines. * Add systematic test for pipeline batching. * Enabling batch_size on almost all pipelines - Not `zero-shot` (it's already passing stuff as batched so trickier) - Not `QA` (preprocess uses squad features, we need to switch to real tensors at this boundary. * Adding `min_length_for_response` for conversational. * Making CTC, speech mappings avaiable regardless of framework. * Attempt at fixing automatic tests (ffmpeg not enabled for fast tests) * Removing ffmpeg dependency in tests. * Small fixes. * Slight cleanup. * Adding docs and adressing comments. * Quality. * Update docs/source/main_classes/pipelines.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/question_answering.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/zero_shot_classification.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Improving docs. * Update docs/source/main_classes/pipelines.rst Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> * N -> oberved_batch_size softmax trick. * Follow `padding_side`. * Supporting image pipeline batching (and padding). * Rename `unbatch` -> `loader_batch`. * unbatch_size forgot. * Custom padding for offset mappings. * Attempt to remove librosa. * Adding require_audio. * torchaudio. * Back to using datasets librosa. * Adding help to set a pad_token on the tokenizer. * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Quality. 
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com>
This commit is contained in:
@@ -71,6 +71,11 @@ GPU. If it doesn't don't hesitate to create an issue.
|
|||||||
|
|
||||||
.. code-block::
|
.. code-block::
|
||||||
|
|
||||||
|
import datasets
|
||||||
|
from transformers import pipeline
|
||||||
|
from transformers.pipelines.base import KeyDataset
|
||||||
|
import tqdm
|
||||||
|
|
||||||
pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
|
pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
|
||||||
dataset = datasets.load_dataset("superb", name="asr", split="test")
|
dataset = datasets.load_dataset("superb", name="asr", split="test")
|
||||||
|
|
||||||
@@ -85,6 +90,144 @@ GPU. If it doesn't don't hesitate to create an issue.
|
|||||||
|
|
||||||
.. autofunction:: transformers.pipeline
|
.. autofunction:: transformers.pipeline
|
||||||
|
|
||||||
|
Pipeline batching
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
All pipelines (except `zero-shot-classification` and `question-answering` currently) can use batching. This will work
|
||||||
|
whenever the pipeline uses its streaming ability (so when passing lists or :obj:`Dataset`).
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
from transformers import pipeline
|
||||||
|
from transformers.pipelines.base import KeyDataset
|
||||||
|
import datasets
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
|
||||||
|
pipe = pipeline("text-classification", device=0)
|
||||||
|
for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"):
|
||||||
|
print(out)
|
||||||
|
# [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
|
||||||
|
# Exactly the same output as before, but the contents are passed
|
||||||
|
# as batches to the model
|
||||||
|
|
||||||
|
|
||||||
|
.. warning::
|
||||||
|
|
||||||
|
However, this is not automatically a win for performance. It can be either a 10x speedup or 5x slowdown depending
|
||||||
|
on hardware, data and the actual model being used.
|
||||||
|
|
||||||
|
Example where it's mostly a speedup:
|
||||||
|
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
from transformers import pipeline
|
||||||
|
from torch.utils.data import Dataset
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
pipe = pipeline("text-classification", device=0)
|
||||||
|
|
||||||
|
|
||||||
|
class MyDataset(Dataset):
|
||||||
|
def __len__(self):
|
||||||
|
return 5000
|
||||||
|
|
||||||
|
def __getitem__(self, i):
|
||||||
|
return "This is a test"
|
||||||
|
|
||||||
|
|
||||||
|
dataset = MyDataset()
|
||||||
|
|
||||||
|
for batch_size in [1, 8, 64, 256]:
|
||||||
|
print("-" * 30)
|
||||||
|
print(f"Streaming batch_size={batch_size}")
|
||||||
|
for out in tqdm.tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
# On GTX 970
|
||||||
|
------------------------------
|
||||||
|
Streaming no batching
|
||||||
|
100%|██████████████████████████████████████████████████████████████████████| 5000/5000 [00:26<00:00, 187.52it/s]
|
||||||
|
------------------------------
|
||||||
|
Streaming batch_size=8
|
||||||
|
100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1205.95it/s]
|
||||||
|
------------------------------
|
||||||
|
Streaming batch_size=64
|
||||||
|
100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2478.24it/s]
|
||||||
|
------------------------------
|
||||||
|
Streaming batch_size=256
|
||||||
|
100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2554.43it/s]
|
||||||
|
(diminishing returns, saturated the GPU)
|
||||||
|
|
||||||
|
|
||||||
|
Example where it's mostly a slowdown:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
class MyDataset(Dataset):
|
||||||
|
def __len__(self):
|
||||||
|
return 5000
|
||||||
|
|
||||||
|
def __getitem__(self, i):
|
||||||
|
if i % 64 == 0:
|
||||||
|
n = 100
|
||||||
|
else:
|
||||||
|
n = 1
|
||||||
|
return "This is a test" * n
|
||||||
|
|
||||||
|
This is an occasional very long sentence compared to the others. In that case, the **whole** batch will need to be 400
|
||||||
|
tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to the high slowdown. Even worse, on
|
||||||
|
bigger batches, the program simply crashes.
|
||||||
|
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
------------------------------
|
||||||
|
Streaming no batching
|
||||||
|
100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s]
|
||||||
|
------------------------------
|
||||||
|
Streaming batch_size=8
|
||||||
|
100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 265.74it/s]
|
||||||
|
------------------------------
|
||||||
|
Streaming batch_size=64
|
||||||
|
100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.80it/s]
|
||||||
|
------------------------------
|
||||||
|
Streaming batch_size=256
|
||||||
|
0%| | 0/1000 [00:00<?, ?it/s]
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/home/nicolas/src/transformers/test.py", line 42, in <module>
|
||||||
|
for out in tqdm.tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
|
||||||
|
....
|
||||||
|
q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head)
|
||||||
|
RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch)
|
||||||
|
|
||||||
|
|
||||||
|
There are no good (general) solutions for this problem, and your mileage may vary depending on your use cases.
|
||||||
|
|
||||||
|
|
||||||
|
For users, a rule of thumb is:
|
||||||
|
|
||||||
|
- **Measure performance on your load, with your hardware. Measure, measure, and keep measuring. Real numbers are the
|
||||||
|
only way to go.**
|
||||||
|
- If you are latency constrained (live product doing inference), don't batch
|
||||||
|
- If you are using CPU, don't batch.
|
||||||
|
- If you are optimizing for throughput (you want to run your model on a bunch of static data), on GPU, then:
|
||||||
|
|
||||||
|
- If you have no clue about the size of the sequence_length ("natural" data), by default don't batch, measure and
|
||||||
|
try tentatively to add it, add OOM checks to recover when it will fail (and it will at some point if you don't
|
||||||
|
control the sequence_length.)
|
||||||
|
- If your sequence_length is super regular, then batching is more likely to be VERY interesting, measure and push
|
||||||
|
it until you get OOMs.
|
||||||
|
- The larger the GPU the more likely batching is going to be more interesting
|
||||||
|
- As soon as you enable batching, make sure you can handle OOMs nicely.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Implementing a pipeline
|
Implementing a pipeline
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
|||||||
@@ -584,6 +584,7 @@ if is_torch_available():
|
|||||||
[
|
[
|
||||||
"MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
|
"MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
|
||||||
"MODEL_FOR_CAUSAL_LM_MAPPING",
|
"MODEL_FOR_CAUSAL_LM_MAPPING",
|
||||||
|
"MODEL_FOR_CTC_MAPPING",
|
||||||
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
|
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
|
||||||
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
|
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
|
||||||
"MODEL_FOR_MASKED_LM_MAPPING",
|
"MODEL_FOR_MASKED_LM_MAPPING",
|
||||||
@@ -594,6 +595,7 @@ if is_torch_available():
|
|||||||
"MODEL_FOR_QUESTION_ANSWERING_MAPPING",
|
"MODEL_FOR_QUESTION_ANSWERING_MAPPING",
|
||||||
"MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
|
"MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
|
||||||
"MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
|
"MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
|
||||||
|
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
|
||||||
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
|
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
|
||||||
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
|
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
|
||||||
"MODEL_MAPPING",
|
"MODEL_MAPPING",
|
||||||
@@ -2430,6 +2432,7 @@ if TYPE_CHECKING:
|
|||||||
from .models.auto import (
|
from .models.auto import (
|
||||||
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
|
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
|
||||||
MODEL_FOR_CAUSAL_LM_MAPPING,
|
MODEL_FOR_CAUSAL_LM_MAPPING,
|
||||||
|
MODEL_FOR_CTC_MAPPING,
|
||||||
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
||||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
||||||
MODEL_FOR_MASKED_LM_MAPPING,
|
MODEL_FOR_MASKED_LM_MAPPING,
|
||||||
@@ -2440,6 +2443,7 @@ if TYPE_CHECKING:
|
|||||||
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
|
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
|
||||||
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
||||||
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
|
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
|
||||||
|
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
||||||
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
|
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
|
||||||
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
||||||
MODEL_MAPPING,
|
MODEL_MAPPING,
|
||||||
|
|||||||
@@ -169,6 +169,10 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
|
|||||||
elif model_class in MODEL_FOR_CTC_MAPPING.values():
|
elif model_class in MODEL_FOR_CTC_MAPPING.values():
|
||||||
outputs = self.model(**model_inputs)
|
outputs = self.model(**model_inputs)
|
||||||
tokens = outputs.logits.squeeze(0).argmax(dim=-1)
|
tokens = outputs.logits.squeeze(0).argmax(dim=-1)
|
||||||
|
else:
|
||||||
|
logger.warning("This is an unknown class, treating it as CTC.")
|
||||||
|
outputs = self.model(**model_inputs)
|
||||||
|
tokens = outputs.logits.squeeze(0).argmax(dim=-1)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
def postprocess(self, model_outputs):
|
def postprocess(self, model_outputs):
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ from contextlib import contextmanager
|
|||||||
from os.path import abspath, exists
|
from os.path import abspath, exists
|
||||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from packaging import version
|
from packaging import version
|
||||||
|
|
||||||
from ..feature_extraction_utils import PreTrainedFeatureExtractor
|
from ..feature_extraction_utils import PreTrainedFeatureExtractor
|
||||||
@@ -59,12 +60,80 @@ if TYPE_CHECKING:
|
|||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def collate_fn(items):
|
def no_collate_fn(items):
|
||||||
if len(items) != 1:
|
if len(items) != 1:
|
||||||
raise ValueError("This collate_fn is meant to be used with batch_size=1")
|
raise ValueError("This collate_fn is meant to be used with batch_size=1")
|
||||||
return items[0]
|
return items[0]
|
||||||
|
|
||||||
|
|
||||||
|
def _pad(items, key, padding_value, padding_side):
    """
    Collate the values stored under ``key`` in every element of ``items`` into one batch.

    Args:
        items: list of dict-like samples; every sample must contain ``key``.
        key: name of the entry to collate.
        padding_value: scalar added to a zero tensor to fill the padded positions.
        padding_side: ``"left"`` writes each sequence at the end of its row (padding comes first); any other
            value writes it at the start of its row (padding comes last).

    Returns:
        - a plain list of the raw values when the first sample's value is not a ``torch.Tensor``;
        - for 4-D tensors, their concatenation along dimension 0 (no padding is applied);
        - for 2-D / 3-D tensors, a new tensor of shape ``(len(items), max_length[, last_dim])`` where
          ``max_length`` is the largest size of dimension 1 among the samples.

    Raises:
        ValueError: if the tensors have a rank other than 2, 3 or 4.
    """
    first = items[0][key]
    if not isinstance(first, torch.Tensor):
        # Non-tensor entries are simply gathered.
        return [item[key] for item in items]

    shape = first.shape
    dim = len(shape)
    if dim == 4:
        # This is probably an image (B, C, H, W) so padding shouldn't be necessary.
        return torch.cat([item[key] for item in items], dim=0)
    if dim not in (2, 3):
        # Fail with an explicit message instead of an IndexError / unbound local further down.
        raise ValueError(f"Cannot pad `{key}`: expected a tensor with 2, 3 or 4 dimensions, got {dim}")

    batch_size = len(items)
    max_length = max(item[key].shape[1] for item in items)
    # Kept as `zeros + padding_value` so the resulting dtype follows the same promotion rules as before.
    tensor = torch.zeros((batch_size, max_length) + tuple(shape[2:]), dtype=first.dtype) + padding_value

    for i, item in enumerate(items):
        # Only row 0 of each sample is copied: samples are assumed to carry a leading batch dimension of 1.
        row = item[key][0]
        length = len(row)
        if padding_side == "left":
            # An explicit start offset (rather than `-length:`) also works for an empty sequence,
            # where `-0:` would select the whole row.
            tensor[i, max_length - length :] = row
        else:
            tensor[i, :length] = row
    return tensor
|
||||||
|
|
||||||
|
|
||||||
|
def pad_collate_fn(tokenizer, feature_extractor):
    """
    Build a ``collate_fn`` that batches preprocessed pipeline samples, padding them when needed.

    Padding settings are read from ``tokenizer`` first. A ``padding_value`` / ``padding_side`` attribute
    defined on ``feature_extractor`` takes precedence; when the feature extractor does not define one, the
    tokenizer's setting (or the ``"right"`` default side) is kept rather than being replaced by ``None``.

    Args:
        tokenizer: object exposing ``pad_token_id`` and ``padding_side``, or ``None``.
        feature_extractor: object optionally exposing ``padding_value`` and ``padding_side``, or ``None``.

    Returns:
        A function taking a list of dict-like samples and returning a dict with one collated entry per key.
        Keys starting with ``"input_"`` are padded with the selected padding value, every other key with 0.

    Raises:
        ValueError: if both arguments are ``None``, or if the tokenizer has no ``pad_token_id``. The returned
            function raises ``ValueError`` when the samples of a batch do not all share the same keys.
    """
    if tokenizer is None and feature_extractor is None:
        raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching")

    padding_value = None
    padding_side = "right"
    if tokenizer is not None:
        if tokenizer.pad_token_id is None:
            raise ValueError(
                "Pipeline with tokenizer without pad_token cannot do batching. You can try to set it with "
                "`pipe.tokenizer.pad_token_id = model.config.eos_token_id`."
            )
        padding_value = tokenizer.pad_token_id
        padding_side = tokenizer.padding_side
    if feature_extractor is not None:
        # Feature extractor can be images, where no padding is expected: only override the tokenizer's
        # settings with values the feature extractor actually defines.
        extractor_value = getattr(feature_extractor, "padding_value", None)
        extractor_side = getattr(feature_extractor, "padding_side", None)
        if extractor_value is not None:
            padding_value = extractor_value
        if extractor_side is not None:
            padding_side = extractor_side

    def inner(items):
        keys = set(items[0].keys())
        for item in items:
            if set(item.keys()) != keys:
                raise ValueError(
                    f"The elements of the batch contain different keys. Cannot batch them ({set(item.keys())} != {keys})"
                )
        # input_values, input_pixels, input_ids, ... use the padding value; masks and the like use 0.
        return {
            key: _pad(items, key, padding_value if key.startswith("input_") else 0, padding_side) for key in keys
        }

    return inner
|
||||||
|
|
||||||
|
|
||||||
def infer_framework_load_model(
|
def infer_framework_load_model(
|
||||||
model,
|
model,
|
||||||
config: AutoConfig,
|
config: AutoConfig,
|
||||||
@@ -591,6 +660,13 @@ PIPELINE_INIT_ARGS = r"""
|
|||||||
is provided.
|
is provided.
|
||||||
task (:obj:`str`, defaults to :obj:`""`):
|
task (:obj:`str`, defaults to :obj:`""`):
|
||||||
A task-identifier for the pipeline.
|
A task-identifier for the pipeline.
|
||||||
|
num_workers (:obj:`int`, `optional`, defaults to 8):
|
||||||
|
When the pipeline will use `DataLoader` (when passing a dataset, on GPU for a Pytorch model), the number of
|
||||||
|
workers to be used.
|
||||||
|
batch_size (:obj:`int`, `optional`, defaults to 1):
|
||||||
|
When the pipeline will use `DataLoader` (when passing a dataset, on GPU for a Pytorch model), the size of
|
||||||
|
the batch to use, for inference this is not always beneficial, please read `Batching with pipelines
|
||||||
|
<https://huggingface.co/transformers/main_classes/pipelines.html#pipeline-batching>`_ .
|
||||||
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
|
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
|
||||||
Reference to the object in charge of parsing supplied pipeline parameters.
|
Reference to the object in charge of parsing supplied pipeline parameters.
|
||||||
device (:obj:`int`, `optional`, defaults to -1):
|
device (:obj:`int`, `optional`, defaults to -1):
|
||||||
@@ -617,10 +693,44 @@ if is_torch_available():
|
|||||||
return processed
|
return processed
|
||||||
|
|
||||||
class PipelineIterator(IterableDataset):
|
class PipelineIterator(IterableDataset):
|
||||||
def __init__(self, loader, infer, params):
|
def __init__(self, loader, infer, params, loader_batch_size=None):
|
||||||
|
"""
|
||||||
|
Roughly equivalent to
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
for item in loader:
|
||||||
|
yield infer(item, **params)
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
loader (:obj:`torch.utils.data.DataLoader` or any iterator):
|
||||||
|
The iterator that will be used to apply :obj:`infer` on.
|
||||||
|
infer (any function):
|
||||||
|
The function to apply to each element of :obj:`loader`.
|
||||||
|
params (:obj:`dict`):
|
||||||
|
The parameters passed to :obj:`infer` along with every item
|
||||||
|
loader_batch_size (:obj:`int`, `optional`):
|
||||||
|
If specified, the items of :obj:`loader` are expected to come as batches, and are unrolled here
|
||||||
|
making it roughly behave as
|
||||||
|
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
for items in loader:
|
||||||
|
for i in range(loader_batch_size):
|
||||||
|
item = items[i]
|
||||||
|
yield infer(item, **params)
|
||||||
|
"""
|
||||||
self.loader = loader
|
self.loader = loader
|
||||||
self.infer = infer
|
self.infer = infer
|
||||||
self.params = params
|
self.params = params
|
||||||
|
if loader_batch_size == 1:
|
||||||
|
# Let's spare some time by deactivating altogether
|
||||||
|
loader_batch_size = None
|
||||||
|
self.loader_batch_size = loader_batch_size
|
||||||
|
|
||||||
|
# Internal bookkeeping
|
||||||
|
self._loader_batch_index = None
|
||||||
|
self._loader_batch_data = None
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.loader)
|
return len(self.loader)
|
||||||
@@ -629,9 +739,48 @@ if is_torch_available():
|
|||||||
self.iterator = iter(self.loader)
|
self.iterator = iter(self.loader)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def loader_batch_item(self):
    """
    Return the element at the current position of the stored batch, then advance that position by one.

    When the stored batch is a ``torch.Tensor`` the indexed element is returned as is. Otherwise the batch
    is treated as a mapping: every entry except ``"past_key_values"`` is indexed at the current position,
    tensors and numpy arrays get a new leading dimension of size 1, and the result is rebuilt with the
    same class as the stored batch.
    """
    position = self._loader_batch_index
    batch = self._loader_batch_data
    if isinstance(batch, torch.Tensor):
        result = batch[position]
    else:
        unrolled = {}
        for name, values in batch.items():
            if name == "past_key_values":
                # This entry is dropped from the unrolled item.
                continue
            entry = values[position]
            if isinstance(entry, torch.Tensor):
                entry = entry.unsqueeze(0)
            elif isinstance(entry, np.ndarray):
                entry = np.expand_dims(entry, 0)
            unrolled[name] = entry
        result = batch.__class__(unrolled)
    self._loader_batch_index = position + 1
    return result
|
||||||
|
|
||||||
def __next__(self):
    """
    Produce the next output of the iterator.

    While a previously processed batch still has elements left, they are served one at a time through
    ``loader_batch_item``. Otherwise the next item of the underlying iterator is run through ``infer``;
    without a ``loader_batch_size`` the result is returned directly, else it is stored as the current
    batch and its first element is returned.
    """
    pending = self._loader_batch_index
    if pending is not None and pending < self.loader_batch_size:
        # Still unrolling the batch produced by an earlier call.
        return self.loader_batch_item()

    item = next(self.iterator)
    processed = self.infer(item, **self.params)
    if self.loader_batch_size is None:
        return processed

    # Measure the batch actually produced, using the output itself or its first entry.
    if isinstance(processed, torch.Tensor):
        first_tensor = processed
    else:
        first_tensor = processed[list(processed.keys())[0]]
    observed_batch_size = len(first_tensor) if isinstance(first_tensor, list) else first_tensor.shape[0]
    if 0 < observed_batch_size < self.loader_batch_size:
        # Could be last batch so we can't unroll as many elements.
        self.loader_batch_size = observed_batch_size
    self._loader_batch_data = processed
    self._loader_batch_index = 0
    return self.loader_batch_item()
|
||||||
|
|
||||||
class KeyDataset(Dataset):
|
class KeyDataset(Dataset):
|
||||||
@@ -881,17 +1030,20 @@ class Pipeline(_ScikitCompat):
|
|||||||
raise ValueError(f"Framework {self.framework} is not supported")
|
raise ValueError(f"Framework {self.framework} is not supported")
|
||||||
return model_outputs
|
return model_outputs
|
||||||
|
|
||||||
def get_iterator(self, inputs, num_workers: int, preprocess_params, forward_params, postprocess_params):
|
def get_iterator(
|
||||||
|
self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
|
||||||
|
):
|
||||||
if "TOKENIZERS_PARALLELISM" not in os.environ:
|
if "TOKENIZERS_PARALLELISM" not in os.environ:
|
||||||
logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
|
logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
|
||||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
dataset = PipelineDataset(inputs, self.preprocess, preprocess_params)
|
dataset = PipelineDataset(inputs, self.preprocess, preprocess_params)
|
||||||
dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=1, collate_fn=collate_fn)
|
collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, self.feature_extractor)
|
||||||
model_iterator = PipelineIterator(dataloader, self.forward, forward_params)
|
dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
|
||||||
|
model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
|
||||||
final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
|
final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
|
||||||
return final_iterator
|
return final_iterator
|
||||||
|
|
||||||
def __call__(self, inputs, *args, num_workers=8, **kwargs):
|
def __call__(self, inputs, *args, num_workers=0, batch_size=1, **kwargs):
|
||||||
if args:
|
if args:
|
||||||
logger.warning(f"Ignoring args : {args}")
|
logger.warning(f"Ignoring args : {args}")
|
||||||
preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(**kwargs)
|
preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(**kwargs)
|
||||||
@@ -910,14 +1062,16 @@ class Pipeline(_ScikitCompat):
|
|||||||
if isinstance(inputs, list):
|
if isinstance(inputs, list):
|
||||||
if self.framework == "pt":
|
if self.framework == "pt":
|
||||||
final_iterator = self.get_iterator(
|
final_iterator = self.get_iterator(
|
||||||
inputs, num_workers, preprocess_params, forward_params, postprocess_params
|
inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
|
||||||
)
|
)
|
||||||
outputs = [output for output in final_iterator]
|
outputs = [output for output in final_iterator]
|
||||||
return outputs
|
return outputs
|
||||||
else:
|
else:
|
||||||
return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
|
return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
|
||||||
elif Dataset is not None and isinstance(inputs, Dataset):
|
elif Dataset is not None and isinstance(inputs, Dataset):
|
||||||
return self.get_iterator(inputs, num_workers, preprocess_params, forward_params, postprocess_params)
|
return self.get_iterator(
|
||||||
|
inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
|
return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
|
||||||
|
|
||||||
|
|||||||
@@ -243,7 +243,7 @@ class ConversationalPipeline(Pipeline):
|
|||||||
return outputs[0]
|
return outputs[0]
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
def preprocess(self, conversation: Conversation) -> Dict[str, Any]:
|
def preprocess(self, conversation: Conversation, min_length_for_response=32) -> Dict[str, Any]:
|
||||||
if not isinstance(conversation, Conversation):
|
if not isinstance(conversation, Conversation):
|
||||||
raise ValueError("ConversationalPipeline, expects Conversation as inputs")
|
raise ValueError("ConversationalPipeline, expects Conversation as inputs")
|
||||||
if conversation.new_user_input is None:
|
if conversation.new_user_input is None:
|
||||||
@@ -274,18 +274,18 @@ class ConversationalPipeline(Pipeline):
|
|||||||
if "attention_mask" in model_inputs:
|
if "attention_mask" in model_inputs:
|
||||||
model_inputs["attention_mask"] = model_inputs["attention_mask"][:, -trim:]
|
model_inputs["attention_mask"] = model_inputs["attention_mask"][:, -trim:]
|
||||||
conversation = model_inputs.pop("conversation")
|
conversation = model_inputs.pop("conversation")
|
||||||
model_inputs["max_length"] = max_length
|
generate_kwargs["max_length"] = max_length
|
||||||
output_ids = self.model.generate(**model_inputs, **generate_kwargs)
|
output_ids = self.model.generate(**model_inputs, **generate_kwargs)
|
||||||
if self.model.config.is_encoder_decoder:
|
if self.model.config.is_encoder_decoder:
|
||||||
start_position = 1
|
start_position = 1
|
||||||
else:
|
else:
|
||||||
start_position = n
|
start_position = n
|
||||||
return {"output_ids": output_ids[0, start_position:], "conversation": conversation}
|
return {"output_ids": output_ids[:, start_position:], "conversation": conversation}
|
||||||
|
|
||||||
def postprocess(self, model_outputs, clean_up_tokenization_spaces=True):
|
def postprocess(self, model_outputs, clean_up_tokenization_spaces=True):
|
||||||
output_ids = model_outputs["output_ids"]
|
output_ids = model_outputs["output_ids"]
|
||||||
answer = self.tokenizer.decode(
|
answer = self.tokenizer.decode(
|
||||||
output_ids,
|
output_ids[0],
|
||||||
skip_special_tokens=True,
|
skip_special_tokens=True,
|
||||||
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -89,14 +89,14 @@ class FillMaskPipeline(Pipeline):
|
|||||||
|
|
||||||
def _forward(self, model_inputs):
|
def _forward(self, model_inputs):
|
||||||
model_outputs = self.model(**model_inputs)
|
model_outputs = self.model(**model_inputs)
|
||||||
model_outputs["input_ids"] = model_inputs["input_ids"][0]
|
model_outputs["input_ids"] = model_inputs["input_ids"]
|
||||||
return model_outputs
|
return model_outputs
|
||||||
|
|
||||||
def postprocess(self, model_outputs, top_k=5, target_ids=None):
|
def postprocess(self, model_outputs, top_k=5, target_ids=None):
|
||||||
# Cap top_k if there are targets
|
# Cap top_k if there are targets
|
||||||
if target_ids is not None and target_ids.shape[0] < top_k:
|
if target_ids is not None and target_ids.shape[0] < top_k:
|
||||||
top_k = target_ids.shape[0]
|
top_k = target_ids.shape[0]
|
||||||
input_ids = model_outputs["input_ids"]
|
input_ids = model_outputs["input_ids"][0]
|
||||||
outputs = model_outputs["logits"]
|
outputs = model_outputs["logits"]
|
||||||
result = []
|
result = []
|
||||||
|
|
||||||
|
|||||||
@@ -114,11 +114,12 @@ class ObjectDetectionPipeline(Pipeline):
|
|||||||
def _forward(self, model_inputs):
|
def _forward(self, model_inputs):
|
||||||
target_size = model_inputs.pop("target_size")
|
target_size = model_inputs.pop("target_size")
|
||||||
outputs = self.model(**model_inputs)
|
outputs = self.model(**model_inputs)
|
||||||
model_outputs = {"outputs": outputs, "target_size": target_size}
|
model_outputs = outputs.__class__({"target_size": target_size, **outputs})
|
||||||
return model_outputs
|
return model_outputs
|
||||||
|
|
||||||
def postprocess(self, model_outputs, threshold=0.9):
|
def postprocess(self, model_outputs, threshold=0.9):
|
||||||
raw_annotations = self.feature_extractor.post_process(model_outputs["outputs"], model_outputs["target_size"])
|
target_size = model_outputs["target_size"]
|
||||||
|
raw_annotations = self.feature_extractor.post_process(model_outputs, target_size)
|
||||||
raw_annotation = raw_annotations[0]
|
raw_annotation = raw_annotations[0]
|
||||||
keep = raw_annotation["scores"] > threshold
|
keep = raw_annotation["scores"] > threshold
|
||||||
scores = raw_annotation["scores"][keep]
|
scores = raw_annotation["scores"][keep]
|
||||||
|
|||||||
@@ -8,9 +8,12 @@ from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_featur
|
|||||||
from ..file_utils import PaddingStrategy, add_end_docstrings, is_tf_available, is_torch_available
|
from ..file_utils import PaddingStrategy, add_end_docstrings, is_tf_available, is_torch_available
|
||||||
from ..modelcard import ModelCard
|
from ..modelcard import ModelCard
|
||||||
from ..tokenization_utils import PreTrainedTokenizer
|
from ..tokenization_utils import PreTrainedTokenizer
|
||||||
|
from ..utils import logging
|
||||||
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline
|
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from ..modeling_tf_utils import TFPreTrainedModel
|
from ..modeling_tf_utils import TFPreTrainedModel
|
||||||
from ..modeling_utils import PreTrainedModel
|
from ..modeling_utils import PreTrainedModel
|
||||||
@@ -241,6 +244,9 @@ class QuestionAnsweringPipeline(Pipeline):
|
|||||||
- **end** (:obj:`int`) -- The character end index of the answer (in the tokenized version of the input).
|
- **end** (:obj:`int`) -- The character end index of the answer (in the tokenized version of the input).
|
||||||
- **answer** (:obj:`str`) -- The answer to the question.
|
- **answer** (:obj:`str`) -- The answer to the question.
|
||||||
"""
|
"""
|
||||||
|
if kwargs.get("batch_size", 1) > 1:
|
||||||
|
logger.error("Batch_size > 1 is not supported for question answering pipeline, setting it to 1.")
|
||||||
|
kwargs["batch_size"] = 1
|
||||||
|
|
||||||
# Convert inputs to features
|
# Convert inputs to features
|
||||||
examples = self._args_parser(*args, **kwargs)
|
examples = self._args_parser(*args, **kwargs)
|
||||||
|
|||||||
@@ -204,12 +204,12 @@ class TokenClassificationPipeline(Pipeline):
|
|||||||
offset_mapping = model_inputs.pop("offset_mapping", None)
|
offset_mapping = model_inputs.pop("offset_mapping", None)
|
||||||
sentence = model_inputs.pop("sentence")
|
sentence = model_inputs.pop("sentence")
|
||||||
if self.framework == "tf":
|
if self.framework == "tf":
|
||||||
outputs = self.model(model_inputs.data)[0][0]
|
logits = self.model(model_inputs.data)[0]
|
||||||
else:
|
else:
|
||||||
outputs = self.model(**model_inputs)[0][0]
|
logits = self.model(**model_inputs)[0]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"outputs": outputs,
|
"logits": logits,
|
||||||
"special_tokens_mask": special_tokens_mask,
|
"special_tokens_mask": special_tokens_mask,
|
||||||
"offset_mapping": offset_mapping,
|
"offset_mapping": offset_mapping,
|
||||||
"sentence": sentence,
|
"sentence": sentence,
|
||||||
@@ -217,13 +217,16 @@ class TokenClassificationPipeline(Pipeline):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def postprocess(self, model_outputs, aggregation_strategy=AggregationStrategy.NONE):
|
def postprocess(self, model_outputs, aggregation_strategy=AggregationStrategy.NONE):
|
||||||
outputs = model_outputs["outputs"].numpy()
|
logits = model_outputs["logits"][0].numpy()
|
||||||
sentence = model_outputs["sentence"]
|
sentence = model_outputs["sentence"]
|
||||||
input_ids = model_outputs["input_ids"][0]
|
input_ids = model_outputs["input_ids"][0]
|
||||||
offset_mapping = model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None
|
offset_mapping = model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None
|
||||||
special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy()
|
special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy()
|
||||||
|
|
||||||
scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
|
maxes = np.max(logits, axis=-1, keepdims=True)
|
||||||
|
shifted_exp = np.exp(logits - maxes)
|
||||||
|
scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
|
||||||
|
|
||||||
pre_entities = self.gather_pre_entities(
|
pre_entities = self.gather_pre_entities(
|
||||||
sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
|
sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -183,6 +183,9 @@ class ZeroShotClassificationPipeline(Pipeline):
|
|||||||
- **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood.
|
- **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood.
|
||||||
- **scores** (:obj:`List[float]`) -- The probabilities for each of the labels.
|
- **scores** (:obj:`List[float]`) -- The probabilities for each of the labels.
|
||||||
"""
|
"""
|
||||||
|
if kwargs.get("batch_size", 1) > 1:
|
||||||
|
logger.error("Batch size > 1 is not supported for zero-shot pipeline, setting batch_size=1.")
|
||||||
|
kwargs["batch_size"] = 1
|
||||||
|
|
||||||
if len(args) == 0:
|
if len(args) == 0:
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -313,6 +313,9 @@ MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = None
|
|||||||
MODEL_FOR_CAUSAL_LM_MAPPING = None
|
MODEL_FOR_CAUSAL_LM_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
|
MODEL_FOR_CTC_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None
|
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
@@ -343,6 +346,9 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None
|
|||||||
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = None
|
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
|
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None
|
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ from transformers.testing_utils import (
|
|||||||
require_datasets,
|
require_datasets,
|
||||||
require_tf,
|
require_tf,
|
||||||
require_torch,
|
require_torch,
|
||||||
|
require_torchaudio,
|
||||||
slow,
|
slow,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -35,15 +36,16 @@ from .test_pipelines_common import ANY, PipelineTestCaseMeta
|
|||||||
class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||||
model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
|
model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
|
||||||
|
|
||||||
@require_datasets
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
@slow
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
|
||||||
import datasets
|
|
||||||
|
|
||||||
audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=feature_extractor)
|
audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=feature_extractor)
|
||||||
|
|
||||||
# test with a raw waveform
|
# test with a raw waveform
|
||||||
audio = np.zeros((34000,))
|
audio = np.zeros((34000,))
|
||||||
|
audio2 = np.zeros((14000,))
|
||||||
|
return audio_classifier, [audio2, audio]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, audio_classifier, examples):
|
||||||
|
audio2, audio = examples
|
||||||
output = audio_classifier(audio)
|
output = audio_classifier(audio)
|
||||||
# by default a model is initialized with num_labels=2
|
# by default a model is initialized with num_labels=2
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@@ -61,10 +63,17 @@ class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.run_torchaudio(audio_classifier)
|
||||||
|
|
||||||
|
@require_datasets
|
||||||
|
@require_torchaudio
|
||||||
|
def run_torchaudio(self, audio_classifier):
|
||||||
|
import datasets
|
||||||
|
|
||||||
# test with a local file
|
# test with a local file
|
||||||
dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||||
filename = dataset[0]["file"]
|
audio = dataset[0]["audio"]["array"]
|
||||||
output = audio_classifier(filename)
|
output = audio_classifier(audio)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
output,
|
output,
|
||||||
[
|
[
|
||||||
|
|||||||
@@ -14,11 +14,28 @@
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from transformers import AutoFeatureExtractor, AutoTokenizer, Speech2TextForConditionalGeneration, Wav2Vec2ForCTC
|
from transformers import (
|
||||||
|
MODEL_FOR_CTC_MAPPING,
|
||||||
|
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
||||||
|
AutoFeatureExtractor,
|
||||||
|
AutoTokenizer,
|
||||||
|
Speech2TextForConditionalGeneration,
|
||||||
|
Wav2Vec2ForCTC,
|
||||||
|
)
|
||||||
from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline
|
from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline
|
||||||
from transformers.testing_utils import is_pipeline_test, require_datasets, require_torch, require_torchaudio, slow
|
from transformers.testing_utils import (
|
||||||
|
is_pipeline_test,
|
||||||
|
require_datasets,
|
||||||
|
require_tf,
|
||||||
|
require_torch,
|
||||||
|
require_torchaudio,
|
||||||
|
slow,
|
||||||
|
)
|
||||||
|
|
||||||
|
from .test_pipelines_common import ANY, PipelineTestCaseMeta
|
||||||
|
|
||||||
|
|
||||||
# We can't use this mixin because it assumes TF support.
|
# We can't use this mixin because it assumes TF support.
|
||||||
@@ -26,14 +43,42 @@ from transformers.testing_utils import is_pipeline_test, require_datasets, requi
|
|||||||
|
|
||||||
|
|
||||||
@is_pipeline_test
|
@is_pipeline_test
|
||||||
class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
|
class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||||
|
model_mapping = {
|
||||||
|
k: v
|
||||||
|
for k, v in (list(MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING.items()) if MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING else [])
|
||||||
|
+ (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else [])
|
||||||
|
}
|
||||||
|
|
||||||
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
|
if tokenizer is None:
|
||||||
|
# Side effect of no Fast Tokenizer class for these model, so skipping
|
||||||
|
# But the slow tokenizer test should still run as they're quite small
|
||||||
|
self.skipTest("No tokenizer available")
|
||||||
|
return
|
||||||
|
# return None, None
|
||||||
|
|
||||||
|
speech_recognizer = AutomaticSpeechRecognitionPipeline(
|
||||||
|
model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
|
||||||
|
)
|
||||||
|
|
||||||
|
# test with a raw waveform
|
||||||
|
audio = np.zeros((34000,))
|
||||||
|
audio2 = np.zeros((14000,))
|
||||||
|
return speech_recognizer, [audio, audio2]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, speech_recognizer, examples):
|
||||||
|
audio = np.zeros((34000,))
|
||||||
|
outputs = speech_recognizer(audio)
|
||||||
|
self.assertEqual(outputs, {"text": ANY(str)})
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@slow
|
@slow
|
||||||
def test_pt_defaults(self):
|
def test_pt_defaults(self):
|
||||||
pipeline("automatic-speech-recognition", framework="pt")
|
pipeline("automatic-speech-recognition", framework="pt")
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
def test_torch_small(self):
|
def test_small_model_pt(self):
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
speech_recognizer = pipeline(
|
speech_recognizer = pipeline(
|
||||||
@@ -46,6 +91,10 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
|
|||||||
output = speech_recognizer(waveform)
|
output = speech_recognizer(waveform)
|
||||||
self.assertEqual(output, {"text": "(Applaudissements)"})
|
self.assertEqual(output, {"text": "(Applaudissements)"})
|
||||||
|
|
||||||
|
@require_tf
|
||||||
|
def test_small_model_tf(self):
|
||||||
|
self.skipTest("Tensorflow not supported yet.")
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
def test_torch_small_no_tokenizer_files(self):
|
def test_torch_small_no_tokenizer_files(self):
|
||||||
# test that model without tokenizer file cannot be loaded
|
# test that model without tokenizer file cannot be loaded
|
||||||
|
|||||||
@@ -12,8 +12,10 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
import copy
|
||||||
import importlib
|
import importlib
|
||||||
import logging
|
import logging
|
||||||
|
import random
|
||||||
import string
|
import string
|
||||||
import unittest
|
import unittest
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
@@ -21,6 +23,7 @@ from functools import lru_cache
|
|||||||
from unittest import skipIf
|
from unittest import skipIf
|
||||||
|
|
||||||
from transformers import FEATURE_EXTRACTOR_MAPPING, TOKENIZER_MAPPING, AutoFeatureExtractor, AutoTokenizer, pipeline
|
from transformers import FEATURE_EXTRACTOR_MAPPING, TOKENIZER_MAPPING, AutoFeatureExtractor, AutoTokenizer, pipeline
|
||||||
|
from transformers.pipelines.base import _pad
|
||||||
from transformers.testing_utils import is_pipeline_test, require_torch
|
from transformers.testing_utils import is_pipeline_test, require_torch
|
||||||
|
|
||||||
|
|
||||||
@@ -73,6 +76,12 @@ def get_tiny_config_from_class(configuration_class):
|
|||||||
@lru_cache(maxsize=100)
|
@lru_cache(maxsize=100)
|
||||||
def get_tiny_tokenizer_from_checkpoint(checkpoint):
|
def get_tiny_tokenizer_from_checkpoint(checkpoint):
|
||||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
||||||
|
if tokenizer.vocab_size < 300:
|
||||||
|
# Wav2Vec2ForCTC for instance
|
||||||
|
# ByT5Tokenizer
|
||||||
|
# all are already small enough and have no Fast version that can
|
||||||
|
# be retrained
|
||||||
|
return tokenizer
|
||||||
logger.info("Training new from iterator ...")
|
logger.info("Training new from iterator ...")
|
||||||
vocabulary = string.ascii_letters + string.digits + " "
|
vocabulary = string.ascii_letters + string.digits + " "
|
||||||
tokenizer = tokenizer.train_new_from_iterator(vocabulary, vocab_size=len(vocabulary), show_progress=False)
|
tokenizer = tokenizer.train_new_from_iterator(vocabulary, vocab_size=len(vocabulary), show_progress=False)
|
||||||
@@ -87,6 +96,12 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config):
|
|||||||
feature_extractor = None
|
feature_extractor = None
|
||||||
if hasattr(tiny_config, "image_size") and feature_extractor:
|
if hasattr(tiny_config, "image_size") and feature_extractor:
|
||||||
feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
|
feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
|
||||||
|
|
||||||
|
# Speech2TextModel specific.
|
||||||
|
if hasattr(tiny_config, "input_feat_per_channel") and feature_extractor:
|
||||||
|
feature_extractor = feature_extractor.__class__(
|
||||||
|
feature_size=tiny_config.input_feat_per_channel, num_mel_bins=tiny_config.input_feat_per_channel
|
||||||
|
)
|
||||||
return feature_extractor
|
return feature_extractor
|
||||||
|
|
||||||
|
|
||||||
@@ -136,7 +151,26 @@ class PipelineTestCaseMeta(type):
|
|||||||
else:
|
else:
|
||||||
tokenizer = None
|
tokenizer = None
|
||||||
feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config)
|
feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config)
|
||||||
self.run_pipeline_test(model, tokenizer, feature_extractor)
|
pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor)
|
||||||
|
if pipeline is None:
|
||||||
|
# The test can disable itself, but it should be very marginal
|
||||||
|
# Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist)
|
||||||
|
return
|
||||||
|
self.run_pipeline_test(pipeline, examples)
|
||||||
|
|
||||||
|
def run_batch_test(pipeline, examples):
|
||||||
|
# Need to copy because `Conversation` are stateful
|
||||||
|
if pipeline.tokenizer is not None and pipeline.tokenizer.pad_token_id is None:
|
||||||
|
return # No batching for this and it's OK
|
||||||
|
|
||||||
|
# 10 examples with batch size 4 means there needs to be a unfinished batch
|
||||||
|
# which is important for the unbatcher
|
||||||
|
dataset = [copy.deepcopy(random.choice(examples)) for i in range(10)]
|
||||||
|
|
||||||
|
for item in pipeline(dataset, batch_size=4):
|
||||||
|
pass
|
||||||
|
|
||||||
|
run_batch_test(pipeline, examples)
|
||||||
|
|
||||||
return test
|
return test
|
||||||
|
|
||||||
@@ -211,3 +245,85 @@ class CommonPipelineTest(unittest.TestCase):
|
|||||||
dataset = MyDataset()
|
dataset = MyDataset()
|
||||||
for output in text_classifier(dataset):
|
for output in text_classifier(dataset):
|
||||||
self.assertEqual(output, {"label": ANY(str), "score": ANY(float)})
|
self.assertEqual(output, {"label": ANY(str), "score": ANY(float)})
|
||||||
|
|
||||||
|
|
||||||
|
@is_pipeline_test
|
||||||
|
class PipelinePadTest(unittest.TestCase):
|
||||||
|
@require_torch
|
||||||
|
def test_pipeline_padding(self):
|
||||||
|
import torch
|
||||||
|
|
||||||
|
items = [
|
||||||
|
{
|
||||||
|
"label": "label1",
|
||||||
|
"input_ids": torch.LongTensor([[1, 23, 24, 2]]),
|
||||||
|
"attention_mask": torch.LongTensor([[0, 1, 1, 0]]),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "label2",
|
||||||
|
"input_ids": torch.LongTensor([[1, 23, 24, 43, 44, 2]]),
|
||||||
|
"attention_mask": torch.LongTensor([[0, 1, 1, 1, 1, 0]]),
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"])
|
||||||
|
self.assertTrue(
|
||||||
|
torch.allclose(
|
||||||
|
_pad(items, "input_ids", 10, "right"),
|
||||||
|
torch.LongTensor([[1, 23, 24, 2, 10, 10], [1, 23, 24, 43, 44, 2]]),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.assertTrue(
|
||||||
|
torch.allclose(
|
||||||
|
_pad(items, "input_ids", 10, "left"),
|
||||||
|
torch.LongTensor([[10, 10, 1, 23, 24, 2], [1, 23, 24, 43, 44, 2]]),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.assertTrue(
|
||||||
|
torch.allclose(
|
||||||
|
_pad(items, "attention_mask", 0, "right"), torch.LongTensor([[0, 1, 1, 0, 0, 0], [0, 1, 1, 1, 1, 0]])
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
@require_torch
|
||||||
|
def test_pipeline_image_padding(self):
|
||||||
|
import torch
|
||||||
|
|
||||||
|
items = [
|
||||||
|
{
|
||||||
|
"label": "label1",
|
||||||
|
"pixel_values": torch.zeros((1, 3, 10, 10)),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "label2",
|
||||||
|
"pixel_values": torch.zeros((1, 3, 10, 10)),
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"])
|
||||||
|
self.assertTrue(
|
||||||
|
torch.allclose(
|
||||||
|
_pad(items, "pixel_values", 10, "right"),
|
||||||
|
torch.zeros((2, 3, 10, 10)),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
@require_torch
|
||||||
|
def test_pipeline_offset_mapping(self):
|
||||||
|
import torch
|
||||||
|
|
||||||
|
items = [
|
||||||
|
{
|
||||||
|
"offset_mappings": torch.zeros([1, 11, 2], dtype=torch.long),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"offset_mappings": torch.zeros([1, 4, 2], dtype=torch.long),
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
self.assertTrue(
|
||||||
|
torch.allclose(
|
||||||
|
_pad(items, "offset_mappings", 0, "right"),
|
||||||
|
torch.zeros((2, 11, 2), dtype=torch.long),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|||||||
@@ -54,8 +54,11 @@ class ConversationalPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM
|
|||||||
else []
|
else []
|
||||||
)
|
)
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer)
|
conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer)
|
||||||
|
return conversation_agent, [Conversation("Hi there!")]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, conversation_agent, _):
|
||||||
# Simple
|
# Simple
|
||||||
outputs = conversation_agent(Conversation("Hi there!"))
|
outputs = conversation_agent(Conversation("Hi there!"))
|
||||||
self.assertEqual(outputs, Conversation(past_user_inputs=["Hi there!"], generated_responses=[ANY(str)]))
|
self.assertEqual(outputs, Conversation(past_user_inputs=["Hi there!"], generated_responses=[ANY(str)]))
|
||||||
|
|||||||
@@ -14,7 +14,15 @@
|
|||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from transformers import MODEL_MAPPING, TF_MODEL_MAPPING, CLIPConfig, FeatureExtractionPipeline, LxmertConfig, pipeline
|
from transformers import (
|
||||||
|
MODEL_MAPPING,
|
||||||
|
TF_MODEL_MAPPING,
|
||||||
|
CLIPConfig,
|
||||||
|
FeatureExtractionPipeline,
|
||||||
|
LxmertConfig,
|
||||||
|
Wav2Vec2Config,
|
||||||
|
pipeline,
|
||||||
|
)
|
||||||
from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch
|
from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch
|
||||||
|
|
||||||
from .test_pipelines_common import PipelineTestCaseMeta
|
from .test_pipelines_common import PipelineTestCaseMeta
|
||||||
@@ -61,12 +69,12 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
|
|||||||
raise ValueError("We expect lists of floats, nothing else")
|
raise ValueError("We expect lists of floats, nothing else")
|
||||||
return shape
|
return shape
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
if tokenizer is None:
|
if tokenizer is None:
|
||||||
self.skipTest("No tokenizer")
|
self.skipTest("No tokenizer")
|
||||||
return
|
return
|
||||||
|
|
||||||
elif isinstance(model.config, (LxmertConfig, CLIPConfig)):
|
elif isinstance(model.config, (LxmertConfig, CLIPConfig, Wav2Vec2Config)):
|
||||||
self.skipTest(
|
self.skipTest(
|
||||||
"This is an Lxmert bimodal model, we need to find a more consistent way to switch on those models."
|
"This is an Lxmert bimodal model, we need to find a more consistent way to switch on those models."
|
||||||
)
|
)
|
||||||
@@ -81,11 +89,12 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
|
|||||||
)
|
)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
feature_extractor = FeatureExtractionPipeline(
|
feature_extractor = FeatureExtractionPipeline(
|
||||||
model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
|
model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
|
||||||
)
|
)
|
||||||
|
return feature_extractor, ["This is a test", "This is another test"]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, feature_extractor, examples):
|
||||||
outputs = feature_extractor("This is a test")
|
outputs = feature_extractor("This is a test")
|
||||||
|
|
||||||
shape = self.get_shape(outputs)
|
shape = self.get_shape(outputs)
|
||||||
|
|||||||
@@ -159,22 +159,32 @@ class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
|||||||
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="pt")
|
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="pt")
|
||||||
unmasker.tokenizer.pad_token_id = None
|
unmasker.tokenizer.pad_token_id = None
|
||||||
unmasker.tokenizer.pad_token = None
|
unmasker.tokenizer.pad_token = None
|
||||||
self.run_pipeline_test(unmasker.model, unmasker.tokenizer, None)
|
self.run_pipeline_test(unmasker, [])
|
||||||
|
|
||||||
@require_tf
|
@require_tf
|
||||||
def test_model_no_pad_tf(self):
|
def test_model_no_pad_tf(self):
|
||||||
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="tf")
|
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="tf")
|
||||||
unmasker.tokenizer.pad_token_id = None
|
unmasker.tokenizer.pad_token_id = None
|
||||||
unmasker.tokenizer.pad_token = None
|
unmasker.tokenizer.pad_token = None
|
||||||
self.run_pipeline_test(unmasker.model, unmasker.tokenizer, None)
|
self.run_pipeline_test(unmasker, [])
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
if tokenizer is None or tokenizer.mask_token_id is None:
|
if tokenizer is None or tokenizer.mask_token_id is None:
|
||||||
self.skipTest("The provided tokenizer has no mask token, (probably reformer or wav2vec2)")
|
self.skipTest("The provided tokenizer has no mask token, (probably reformer or wav2vec2)")
|
||||||
|
|
||||||
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
|
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
|
||||||
|
examples = [
|
||||||
|
f"This is another {tokenizer.mask_token} test",
|
||||||
|
]
|
||||||
|
return fill_masker, examples
|
||||||
|
|
||||||
outputs = fill_masker(f"This is a {tokenizer.mask_token}")
|
def run_pipeline_test(self, fill_masker, examples):
|
||||||
|
tokenizer = fill_masker.tokenizer
|
||||||
|
model = fill_masker.model
|
||||||
|
|
||||||
|
outputs = fill_masker(
|
||||||
|
f"This is a {tokenizer.mask_token}",
|
||||||
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
outputs,
|
outputs,
|
||||||
[
|
[
|
||||||
|
|||||||
@@ -44,9 +44,17 @@ else:
|
|||||||
class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||||
model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
|
model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
|
||||||
|
|
||||||
@require_datasets
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
|
||||||
image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor)
|
image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor)
|
||||||
|
examples = [
|
||||||
|
Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
|
||||||
|
"http://images.cocodataset.org/val2017/000000039769.jpg",
|
||||||
|
]
|
||||||
|
return image_classifier, examples
|
||||||
|
|
||||||
|
@require_datasets
|
||||||
|
def run_pipeline_test(self, image_classifier, examples):
|
||||||
outputs = image_classifier("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
outputs = image_classifier("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
|
|||||||
@@ -53,9 +53,12 @@ else:
|
|||||||
class ObjectDetectionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
class ObjectDetectionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||||
model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
|
model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
|
||||||
|
|
||||||
@require_datasets
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
|
||||||
object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor)
|
object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor)
|
||||||
|
return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"]
|
||||||
|
|
||||||
|
@require_datasets
|
||||||
|
def run_pipeline_test(self, object_detector, examples):
|
||||||
outputs = object_detector("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
|
outputs = object_detector("./tests/fixtures/tests_samples/COCO/000000039769.png", threshold=0.0)
|
||||||
|
|
||||||
self.assertGreater(len(outputs), 0)
|
self.assertGreater(len(outputs), 0)
|
||||||
|
|||||||
@@ -32,13 +32,20 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
|||||||
model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING
|
model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING
|
||||||
tf_model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING
|
tf_model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
if isinstance(model.config, LxmertConfig):
|
if isinstance(model.config, LxmertConfig):
|
||||||
# This is an bimodal model, we need to find a more consistent way
|
# This is an bimodal model, we need to find a more consistent way
|
||||||
# to switch on those models.
|
# to switch on those models.
|
||||||
return
|
return None, None
|
||||||
question_answerer = QuestionAnsweringPipeline(model, tokenizer)
|
question_answerer = QuestionAnsweringPipeline(model, tokenizer)
|
||||||
|
|
||||||
|
examples = [
|
||||||
|
{"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."},
|
||||||
|
{"question": "In what field is HuggingFace ?", "context": "HuggingFace is an AI startup."},
|
||||||
|
]
|
||||||
|
return question_answerer, examples
|
||||||
|
|
||||||
|
def run_pipeline_test(self, question_answerer, _):
|
||||||
outputs = question_answerer(
|
outputs = question_answerer(
|
||||||
question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris."
|
question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -36,8 +36,12 @@ class SummarizationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMe
|
|||||||
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||||
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer)
|
summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer)
|
||||||
|
return summarizer, ["(CNN)The Palestinian Authority officially became", "Some other text"]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, summarizer, _):
|
||||||
|
model = summarizer.model
|
||||||
|
|
||||||
outputs = summarizer("(CNN)The Palestinian Authority officially became")
|
outputs = summarizer("(CNN)The Palestinian Authority officially became")
|
||||||
self.assertEqual(outputs, [{"summary_text": ANY(str)}])
|
self.assertEqual(outputs, [{"summary_text": ANY(str)}])
|
||||||
|
|||||||
@@ -30,9 +30,11 @@ class Text2TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTest
|
|||||||
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||||
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer)
|
generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer)
|
||||||
|
return generator, ["Something to write", "Something else"]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, generator, _):
|
||||||
outputs = generator("Something there")
|
outputs = generator("Something there")
|
||||||
self.assertEqual(outputs, [{"generated_text": ANY(str)}])
|
self.assertEqual(outputs, [{"generated_text": ANY(str)}])
|
||||||
# These are encoder decoder, they don't just append to incoming string
|
# These are encoder decoder, they don't just append to incoming string
|
||||||
|
|||||||
@@ -72,9 +72,12 @@ class TextClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestC
|
|||||||
outputs = text_classifier("Birds are a type of animal")
|
outputs = text_classifier("Birds are a type of animal")
|
||||||
self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}])
|
self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}])
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
|
text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
|
||||||
|
return text_classifier, ["HuggingFace is in", "This is another test"]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, text_classifier, _):
|
||||||
|
model = text_classifier.model
|
||||||
# Small inputs because BartTokenizer tiny has maximum position embeddings = 22
|
# Small inputs because BartTokenizer tiny has maximum position embeddings = 22
|
||||||
valid_inputs = "HuggingFace is in"
|
valid_inputs = "HuggingFace is in"
|
||||||
outputs = text_classifier(valid_inputs)
|
outputs = text_classifier(valid_inputs)
|
||||||
|
|||||||
@@ -88,8 +88,14 @@ class TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)
|
text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)
|
||||||
|
return text_generator, ["This is a test", "Another test"]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, text_generator, _):
|
||||||
|
model = text_generator.model
|
||||||
|
tokenizer = text_generator.tokenizer
|
||||||
|
|
||||||
outputs = text_generator("This is a test")
|
outputs = text_generator("This is a test")
|
||||||
self.assertEqual(outputs, [{"generated_text": ANY(str)}])
|
self.assertEqual(outputs, [{"generated_text": ANY(str)}])
|
||||||
self.assertTrue(outputs[0]["generated_text"].startswith("This is a test"))
|
self.assertTrue(outputs[0]["generated_text"].startswith("This is a test"))
|
||||||
|
|||||||
@@ -45,8 +45,13 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
|
|||||||
model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
|
model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
|
||||||
tf_model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
|
tf_model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer)
|
token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer)
|
||||||
|
return token_classifier, ["A simple string", "A simple string that is quite a bit longer"]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, token_classifier, _):
|
||||||
|
model = token_classifier.model
|
||||||
|
tokenizer = token_classifier.tokenizer
|
||||||
|
|
||||||
outputs = token_classifier("A simple string")
|
outputs = token_classifier("A simple string")
|
||||||
self.assertIsInstance(outputs, list)
|
self.assertIsInstance(outputs, list)
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ from transformers import (
|
|||||||
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
||||||
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
||||||
MBart50TokenizerFast,
|
MBart50TokenizerFast,
|
||||||
|
MBartConfig,
|
||||||
MBartForConditionalGeneration,
|
MBartForConditionalGeneration,
|
||||||
TranslationPipeline,
|
TranslationPipeline,
|
||||||
pipeline,
|
pipeline,
|
||||||
@@ -34,14 +35,16 @@ class TranslationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta
|
|||||||
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||||
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
|
if isinstance(model.config, MBartConfig):
|
||||||
|
src_lang, tgt_lang = list(tokenizer.lang_code_to_id.keys())[:2]
|
||||||
|
translator = TranslationPipeline(model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang)
|
||||||
|
else:
|
||||||
translator = TranslationPipeline(model=model, tokenizer=tokenizer)
|
translator = TranslationPipeline(model=model, tokenizer=tokenizer)
|
||||||
try:
|
return translator, ["Some string", "Some other text"]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, translator, _):
|
||||||
outputs = translator("Some string")
|
outputs = translator("Some string")
|
||||||
except ValueError:
|
|
||||||
# Triggered by m2m langages
|
|
||||||
src_lang, tgt_lang = list(translator.tokenizer.lang_code_to_id.keys())[:2]
|
|
||||||
outputs = translator("Some string", src_lang=src_lang, tgt_lang=tgt_lang)
|
|
||||||
self.assertEqual(outputs, [{"translation_text": ANY(str)}])
|
self.assertEqual(outputs, [{"translation_text": ANY(str)}])
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
|
|||||||
@@ -31,9 +31,13 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase, metaclass=PipelineT
|
|||||||
model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
|
model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
|
||||||
tf_model_mapping = TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
|
tf_model_mapping = TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
|
||||||
|
|
||||||
def run_pipeline_test(self, model, tokenizer, feature_extractor):
|
def get_test_pipeline(self, model, tokenizer, feature_extractor):
|
||||||
classifier = ZeroShotClassificationPipeline(model=model, tokenizer=tokenizer)
|
classifier = ZeroShotClassificationPipeline(
|
||||||
|
model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"]
|
||||||
|
)
|
||||||
|
return classifier, ["Who are you voting for in 2020?", "My stomach hurts."]
|
||||||
|
|
||||||
|
def run_pipeline_test(self, classifier, _):
|
||||||
outputs = classifier("Who are you voting for in 2020?", candidate_labels="politics")
|
outputs = classifier("Who are you voting for in 2020?", candidate_labels="politics")
|
||||||
self.assertEqual(outputs, {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]})
|
self.assertEqual(outputs, {"sequence": ANY(str), "labels": [ANY(str)], "scores": [ANY(float)]})
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user