Add a fix for custom code tokenizers in pipelines (#32300)

* Add a fix for the case when tokenizers are passed as a string
* Support image processors and feature extractors as well
* Reverting load_feature_extractor and load_image_processor
* Add test
* Test is torch-only
* Add tests for preprocessors and feature extractors and move test
* Extremely experimental fix
* Revert that change, wrong branch!
* Typo!
* Split tests
2024-08-27 14:39:57 +01:00
parent 834ec7b1cc
commit 9956c2bc98
2 changed files with 44 additions and 1 deletion
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -904,7 +904,11 @@ def pipeline(

    model_config = model.config
    hub_kwargs["_commit_hash"] = model.config._commit_hash
-    load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
+    load_tokenizer = (
+        type(model_config) in TOKENIZER_MAPPING
+        or model_config.tokenizer_class is not None
+        or isinstance(tokenizer, str)
+    )
    load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
    load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None

--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -26,10 +26,13 @@ from huggingface_hub import HfFolder, delete_repo
 from requests.exceptions import HTTPError

 from transformers import (
+    AutomaticSpeechRecognitionPipeline,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DistilBertForSequenceClassification,
+    MaskGenerationPipeline,
    TextClassificationPipeline,
+    TextGenerationPipeline,
    TFAutoModelForSequenceClassification,
    pipeline,
 )
@@ -859,6 +862,42 @@ class CustomPipelineTest(unittest.TestCase):

        self.assertEqual(self.COUNT, 1)

+    @require_torch
+    def test_custom_code_with_string_tokenizer(self):
+        # This test checks for an edge case - tokenizer loading used to fail when using a custom code model
+        # with a separate tokenizer that was passed as a repo name rather than a tokenizer object.
+        # See https://github.com/huggingface/transformers/issues/31669
+        text_generator = pipeline(
+            "text-generation",
+            model="Rocketknight1/fake-custom-model-test",
+            tokenizer="Rocketknight1/fake-custom-model-test",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(text_generator, TextGenerationPipeline)  # Assert successful loading
+
+    @require_torch
+    def test_custom_code_with_string_feature_extractor(self):
+        speech_recognizer = pipeline(
+            "automatic-speech-recognition",
+            model="Rocketknight1/fake-custom-wav2vec2",
+            feature_extractor="Rocketknight1/fake-custom-wav2vec2",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(speech_recognizer, AutomaticSpeechRecognitionPipeline)  # Assert successful loading
+
+    @require_torch
+    def test_custom_code_with_string_preprocessor(self):
+        mask_generator = pipeline(
+            "mask-generation",
+            model="Rocketknight1/fake-custom-sam",
+            processor="Rocketknight1/fake-custom-sam",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(mask_generator, MaskGenerationPipeline)  # Assert successful loading
+

@require_torch
@is_staging_test