Adding batch_size support for (almost) all pipelines (#13724)
* Tentative enabling of `batch_size` for pipelines. * Add systematic test for pipeline batching. * Enabling batch_size on almost all pipelines - Not `zero-shot` (it's already passing stuff as batched so trickier) - Not `QA` (preprocess uses squad features, we need to switch to real tensors at this boundary. * Adding `min_length_for_response` for conversational. * Making CTC, speech mappings avaiable regardless of framework. * Attempt at fixing automatic tests (ffmpeg not enabled for fast tests) * Removing ffmpeg dependency in tests. * Small fixes. * Slight cleanup. * Adding docs and adressing comments. * Quality. * Update docs/source/main_classes/pipelines.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/question_answering.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/zero_shot_classification.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Improving docs. * Update docs/source/main_classes/pipelines.rst Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com> * N -> oberved_batch_size softmax trick. * Follow `padding_side`. * Supporting image pipeline batching (and padding). * Rename `unbatch` -> `loader_batch`. * unbatch_size forgot. * Custom padding for offset mappings. * Attempt to remove librosa. * Adding require_audio. * torchaudio. * Back to using datasets librosa. * Adding help to set a pad_token on the tokenizer. * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/pipelines/base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Quality. Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Philipp Schmid <32632186+philschmid@users.noreply.github.com>
This commit is contained in:
@@ -12,8 +12,10 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
import importlib
|
||||
import logging
|
||||
import random
|
||||
import string
|
||||
import unittest
|
||||
from abc import abstractmethod
|
||||
@@ -21,6 +23,7 @@ from functools import lru_cache
|
||||
from unittest import skipIf
|
||||
|
||||
from transformers import FEATURE_EXTRACTOR_MAPPING, TOKENIZER_MAPPING, AutoFeatureExtractor, AutoTokenizer, pipeline
|
||||
from transformers.pipelines.base import _pad
|
||||
from transformers.testing_utils import is_pipeline_test, require_torch
|
||||
|
||||
|
||||
@@ -73,6 +76,12 @@ def get_tiny_config_from_class(configuration_class):
|
||||
@lru_cache(maxsize=100)
|
||||
def get_tiny_tokenizer_from_checkpoint(checkpoint):
|
||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
||||
if tokenizer.vocab_size < 300:
|
||||
# Wav2Vec2ForCTC for instance
|
||||
# ByT5Tokenizer
|
||||
# all are already small enough and have no Fast version that can
|
||||
# be retrained
|
||||
return tokenizer
|
||||
logger.info("Training new from iterator ...")
|
||||
vocabulary = string.ascii_letters + string.digits + " "
|
||||
tokenizer = tokenizer.train_new_from_iterator(vocabulary, vocab_size=len(vocabulary), show_progress=False)
|
||||
@@ -87,6 +96,12 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config):
|
||||
feature_extractor = None
|
||||
if hasattr(tiny_config, "image_size") and feature_extractor:
|
||||
feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
|
||||
|
||||
# Speech2TextModel specific.
|
||||
if hasattr(tiny_config, "input_feat_per_channel") and feature_extractor:
|
||||
feature_extractor = feature_extractor.__class__(
|
||||
feature_size=tiny_config.input_feat_per_channel, num_mel_bins=tiny_config.input_feat_per_channel
|
||||
)
|
||||
return feature_extractor
|
||||
|
||||
|
||||
@@ -136,7 +151,26 @@ class PipelineTestCaseMeta(type):
|
||||
else:
|
||||
tokenizer = None
|
||||
feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config)
|
||||
self.run_pipeline_test(model, tokenizer, feature_extractor)
|
||||
pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor)
|
||||
if pipeline is None:
|
||||
# The test can disable itself, but it should be very marginal
|
||||
# Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist)
|
||||
return
|
||||
self.run_pipeline_test(pipeline, examples)
|
||||
|
||||
def run_batch_test(pipeline, examples):
|
||||
# Need to copy because `Conversation` are stateful
|
||||
if pipeline.tokenizer is not None and pipeline.tokenizer.pad_token_id is None:
|
||||
return # No batching for this and it's OK
|
||||
|
||||
# 10 examples with batch size 4 means there needs to be a unfinished batch
|
||||
# which is important for the unbatcher
|
||||
dataset = [copy.deepcopy(random.choice(examples)) for i in range(10)]
|
||||
|
||||
for item in pipeline(dataset, batch_size=4):
|
||||
pass
|
||||
|
||||
run_batch_test(pipeline, examples)
|
||||
|
||||
return test
|
||||
|
||||
@@ -211,3 +245,85 @@ class CommonPipelineTest(unittest.TestCase):
|
||||
dataset = MyDataset()
|
||||
for output in text_classifier(dataset):
|
||||
self.assertEqual(output, {"label": ANY(str), "score": ANY(float)})
|
||||
|
||||
|
||||
@is_pipeline_test
|
||||
class PipelinePadTest(unittest.TestCase):
|
||||
@require_torch
|
||||
def test_pipeline_padding(self):
|
||||
import torch
|
||||
|
||||
items = [
|
||||
{
|
||||
"label": "label1",
|
||||
"input_ids": torch.LongTensor([[1, 23, 24, 2]]),
|
||||
"attention_mask": torch.LongTensor([[0, 1, 1, 0]]),
|
||||
},
|
||||
{
|
||||
"label": "label2",
|
||||
"input_ids": torch.LongTensor([[1, 23, 24, 43, 44, 2]]),
|
||||
"attention_mask": torch.LongTensor([[0, 1, 1, 1, 1, 0]]),
|
||||
},
|
||||
]
|
||||
|
||||
self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"])
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
_pad(items, "input_ids", 10, "right"),
|
||||
torch.LongTensor([[1, 23, 24, 2, 10, 10], [1, 23, 24, 43, 44, 2]]),
|
||||
)
|
||||
)
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
_pad(items, "input_ids", 10, "left"),
|
||||
torch.LongTensor([[10, 10, 1, 23, 24, 2], [1, 23, 24, 43, 44, 2]]),
|
||||
)
|
||||
)
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
_pad(items, "attention_mask", 0, "right"), torch.LongTensor([[0, 1, 1, 0, 0, 0], [0, 1, 1, 1, 1, 0]])
|
||||
)
|
||||
)
|
||||
|
||||
@require_torch
|
||||
def test_pipeline_image_padding(self):
|
||||
import torch
|
||||
|
||||
items = [
|
||||
{
|
||||
"label": "label1",
|
||||
"pixel_values": torch.zeros((1, 3, 10, 10)),
|
||||
},
|
||||
{
|
||||
"label": "label2",
|
||||
"pixel_values": torch.zeros((1, 3, 10, 10)),
|
||||
},
|
||||
]
|
||||
|
||||
self.assertEqual(_pad(items, "label", 0, "right"), ["label1", "label2"])
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
_pad(items, "pixel_values", 10, "right"),
|
||||
torch.zeros((2, 3, 10, 10)),
|
||||
)
|
||||
)
|
||||
|
||||
@require_torch
|
||||
def test_pipeline_offset_mapping(self):
|
||||
import torch
|
||||
|
||||
items = [
|
||||
{
|
||||
"offset_mappings": torch.zeros([1, 11, 2], dtype=torch.long),
|
||||
},
|
||||
{
|
||||
"offset_mappings": torch.zeros([1, 4, 2], dtype=torch.long),
|
||||
},
|
||||
]
|
||||
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
_pad(items, "offset_mappings", 0, "right"),
|
||||
torch.zeros((2, 11, 2), dtype=torch.long),
|
||||
),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user