From 9956c2bc984d0bf65c52b22b98b882317e7264af Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 27 Aug 2024 14:39:57 +0100 Subject: [PATCH] Add a fix for custom code tokenizers in pipelines (#32300) * Add a fix for the case when tokenizers are passed as a string * Support image processors and feature extractors as well * Reverting load_feature_extractor and load_image_processor * Add test * Test is torch-only * Add tests for preprocessors and feature extractors and move test * Extremely experimental fix * Revert that change, wrong branch! * Typo! * Split tests --- src/transformers/pipelines/__init__.py | 6 +++- tests/pipelines/test_pipelines_common.py | 39 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 9bc0a1cf8b..87baa6b99a 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -904,7 +904,11 @@ def pipeline( model_config = model.config hub_kwargs["_commit_hash"] = model.config._commit_hash - load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None + load_tokenizer = ( + type(model_config) in TOKENIZER_MAPPING + or model_config.tokenizer_class is not None + or isinstance(tokenizer, str) + ) load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 95349a8335..f4aa1a27f5 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -26,10 +26,13 @@ from huggingface_hub import HfFolder, delete_repo from requests.exceptions import HTTPError from transformers import ( + AutomaticSpeechRecognitionPipeline, AutoModelForSequenceClassification, AutoTokenizer, DistilBertForSequenceClassification, + MaskGenerationPipeline, TextClassificationPipeline, + TextGenerationPipeline, TFAutoModelForSequenceClassification, pipeline, ) @@ -859,6 +862,42 @@ class CustomPipelineTest(unittest.TestCase): self.assertEqual(self.COUNT, 1) + @require_torch + def test_custom_code_with_string_tokenizer(self): + # This test checks for an edge case - tokenizer loading used to fail when using a custom code model + # with a separate tokenizer that was passed as a repo name rather than a tokenizer object. + # See https://github.com/huggingface/transformers/issues/31669 + text_generator = pipeline( + "text-generation", + model="Rocketknight1/fake-custom-model-test", + tokenizer="Rocketknight1/fake-custom-model-test", + trust_remote_code=True, + ) + + self.assertIsInstance(text_generator, TextGenerationPipeline) # Assert successful loading + + @require_torch + def test_custom_code_with_string_feature_extractor(self): + speech_recognizer = pipeline( + "automatic-speech-recognition", + model="Rocketknight1/fake-custom-wav2vec2", + feature_extractor="Rocketknight1/fake-custom-wav2vec2", + trust_remote_code=True, + ) + + self.assertIsInstance(speech_recognizer, AutomaticSpeechRecognitionPipeline) # Assert successful loading + + @require_torch + def test_custom_code_with_string_preprocessor(self): + mask_generator = pipeline( + "mask-generation", + model="Rocketknight1/fake-custom-sam", + processor="Rocketknight1/fake-custom-sam", + trust_remote_code=True, + ) + + self.assertIsInstance(mask_generator, MaskGenerationPipeline) # Assert successful loading + @require_torch @is_staging_test