From 9956c2bc984d0bf65c52b22b98b882317e7264af Mon Sep 17 00:00:00 2001
From: Matt <Rocketknight1@users.noreply.github.com>
Date: Tue, 27 Aug 2024 14:39:57 +0100
Subject: [PATCH] Add a fix for custom code tokenizers in pipelines (#32300)

* Add a fix for the case when tokenizers are passed as a string

* Support image processors and feature extractors as well

* Reverting load_feature_extractor and load_image_processor

* Add test

* Test is torch-only

* Add tests for preprocessors and feature extractors and move test

* Extremely experimental fix

* Revert that change, wrong branch!

* Typo!

* Split tests
---
 src/transformers/pipelines/__init__.py   |  6 +++-
 tests/pipelines/test_pipelines_common.py | 39 ++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 9bc0a1cf8b..87baa6b99a 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -904,7 +904,11 @@ def pipeline(
 
     model_config = model.config
     hub_kwargs["_commit_hash"] = model.config._commit_hash
-    load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
+    load_tokenizer = (
+        type(model_config) in TOKENIZER_MAPPING
+        or model_config.tokenizer_class is not None
+        or isinstance(tokenizer, str)
+    )
     load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
     load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
 
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 95349a8335..f4aa1a27f5 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -26,10 +26,13 @@ from huggingface_hub import HfFolder, delete_repo
 from requests.exceptions import HTTPError
 
 from transformers import (
+    AutomaticSpeechRecognitionPipeline,
     AutoModelForSequenceClassification,
     AutoTokenizer,
     DistilBertForSequenceClassification,
+    MaskGenerationPipeline,
     TextClassificationPipeline,
+    TextGenerationPipeline,
     TFAutoModelForSequenceClassification,
     pipeline,
 )
@@ -859,6 +862,42 @@ class CustomPipelineTest(unittest.TestCase):
 
         self.assertEqual(self.COUNT, 1)
 
+    @require_torch
+    def test_custom_code_with_string_tokenizer(self):
+        # This test checks for an edge case - tokenizer loading used to fail when using a custom code model
+        # with a separate tokenizer that was passed as a repo name rather than a tokenizer object.
+        # See https://github.com/huggingface/transformers/issues/31669
+        text_generator = pipeline(
+            "text-generation",
+            model="Rocketknight1/fake-custom-model-test",
+            tokenizer="Rocketknight1/fake-custom-model-test",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(text_generator, TextGenerationPipeline)  # Assert successful loading
+
+    @require_torch
+    def test_custom_code_with_string_feature_extractor(self):
+        speech_recognizer = pipeline(
+            "automatic-speech-recognition",
+            model="Rocketknight1/fake-custom-wav2vec2",
+            feature_extractor="Rocketknight1/fake-custom-wav2vec2",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(speech_recognizer, AutomaticSpeechRecognitionPipeline)  # Assert successful loading
+
+    @require_torch
+    def test_custom_code_with_string_preprocessor(self):
+        mask_generator = pipeline(
+            "mask-generation",
+            model="Rocketknight1/fake-custom-sam",
+            processor="Rocketknight1/fake-custom-sam",
+            trust_remote_code=True,
+        )
+
+        self.assertIsInstance(mask_generator, MaskGenerationPipeline)  # Assert successful loading
+
 
 @require_torch
 @is_staging_test