From 99207bd112560e0d1cccc0d7cd3e4f3815c27aae Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 3 Jun 2020 03:51:31 -0400 Subject: [PATCH] Pipelines: miscellanea of QoL improvements and small features... (#4632) * [hf_api] Attach all unknown attributes for future-proof compatibility * [Pipeline] NerPipeline is really a TokenClassificationPipeline * modelcard.py: I don't think we need to force the download * Remove config, tokenizer from SUPPORTED_TASKS as we're moving to one model = one weight + one tokenizer * FillMaskPipeline: also output token in string form * TextClassificationPipeline: option to return all scores, not just the argmax * Update docs/source/main_classes/pipelines.rst --- docs/source/main_classes/pipelines.rst | 14 ++--- src/transformers/hf_api.py | 8 ++- src/transformers/modelcard.py | 4 +- src/transformers/pipelines.py | 82 ++++++++++++-------------- 4 files changed, 52 insertions(+), 56 deletions(-) diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst index 0ef9858286..9a5fbb2e70 100644 --- a/docs/source/main_classes/pipelines.rst +++ b/docs/source/main_classes/pipelines.rst @@ -8,7 +8,7 @@ Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction an There are two categories of pipeline abstractions to be aware about: - The :class:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines -- The other task-specific pipelines, such as :class:`~transformers.NerPipeline` +- The other task-specific pipelines, such as :class:`~transformers.TokenClassificationPipeline` or :class:`~transformers.QuestionAnsweringPipeline` The pipeline abstraction @@ -30,15 +30,15 @@ Parent class: Pipeline .. autoclass:: transformers.Pipeline :members: predict, transform, save_pretrained -NerPipeline -========================================== - -.. autoclass:: transformers.NerPipeline - TokenClassificationPipeline ========================================== -This class is an alias of the :class:`~transformers.NerPipeline` defined above. Please refer to that pipeline for +.. autoclass:: transformers.TokenClassificationPipeline + +NerPipeline +========================================== + +This class is an alias of the :class:`~transformers.TokenClassificationPipeline` defined above. Please refer to that pipeline for documentation and usage examples. FillMaskPipeline diff --git a/src/transformers/hf_api.py b/src/transformers/hf_api.py index bf1ea4c727..f9fd545034 100644 --- a/src/transformers/hf_api.py +++ b/src/transformers/hf_api.py @@ -64,6 +64,8 @@ class S3Object: self.lastModified = lastModified self.size = size self.rfilename = rfilename + for k, v in kwargs.items(): + setattr(self, k, v) class ModelInfo: @@ -78,7 +80,7 @@ class ModelInfo: author: Optional[str] = None, downloads: Optional[int] = None, tags: List[str] = [], - siblings: List[Dict] = [], # list of files that constitute the model + siblings: Optional[List[Dict]] = None, # list of files that constitute the model **kwargs ): self.modelId = modelId @@ -86,7 +88,9 @@ class ModelInfo: self.author = author self.downloads = downloads self.tags = tags - self.siblings = [S3Object(**x) for x in siblings] + self.siblings = [S3Object(**x) for x in siblings] if siblings is not None else None + for k, v in kwargs.items(): + setattr(self, k, v) class HfApi: diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py index fe218c4de0..d37fa76a4a 100644 --- a/src/transformers/modelcard.py +++ b/src/transformers/modelcard.py @@ -149,9 +149,7 @@ class ModelCard: try: # Load from URL or cache if already cached - resolved_model_card_file = cached_path( - model_card_file, cache_dir=cache_dir, force_download=True, proxies=proxies, resume_download=False - ) + resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, proxies=proxies) if resolved_model_card_file is None: raise EnvironmentError if resolved_model_card_file == model_card_file: diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 35a0872a02..bbc626e0bc 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -28,7 +28,7 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, import numpy as np -from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig +from .configuration_auto import AutoConfig from .configuration_utils import PretrainedConfig from .data import SquadExample, squad_convert_examples_to_features from .file_utils import is_tf_available, is_torch_available @@ -717,10 +717,23 @@ class TextClassificationPipeline(Pipeline): on the associated CUDA device id. """ + def __init__(self, return_all_scores: bool = False, **kwargs): + super().__init__(**kwargs) + + self.return_all_scores = return_all_scores + def __call__(self, *args, **kwargs): outputs = super().__call__(*args, **kwargs) scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True) - return [{"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores] + if self.return_all_scores: + return [ + [{"label": self.model.config.id2label[i], "score": score} for i, score in enumerate(item)] + for item in scores + ] + else: + return [ + {"label": self.model.config.id2label[item.argmax()], "score": item.max().item()} for item in scores + ] class FillMaskPipeline(Pipeline): @@ -813,7 +826,14 @@ class FillMaskPipeline(Pipeline): tokens[masked_index] = p # Filter padding out: tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)] - result.append({"sequence": self.tokenizer.decode(tokens), "score": v, "token": p}) + result.append( + { + "sequence": self.tokenizer.decode(tokens), + "score": v, + "token": p, + "token_str": self.tokenizer.convert_ids_to_tokens(p), + } + ) # Append results += [result] @@ -823,7 +843,7 @@ class FillMaskPipeline(Pipeline): return results -class NerPipeline(Pipeline): +class TokenClassificationPipeline(Pipeline): """ Named Entity Recognition pipeline using ModelForTokenClassification head. See the `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information. @@ -987,7 +1007,7 @@ class NerPipeline(Pipeline): return entity_group -TokenClassificationPipeline = NerPipeline +NerPipeline = TokenClassificationPipeline class QuestionAnsweringArgumentHandler(ArgumentHandler): @@ -1577,11 +1597,7 @@ SUPPORTED_TASKS = { "impl": FeatureExtractionPipeline, "tf": TFAutoModel if is_tf_available() else None, "pt": AutoModel if is_torch_available() else None, - "default": { - "model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}, - "config": None, - "tokenizer": "distilbert-base-cased", - }, + "default": {"model": {"pt": "distilbert-base-cased", "tf": "distilbert-base-cased"}}, }, "sentiment-analysis": { "impl": TextClassificationPipeline, @@ -1592,12 +1608,10 @@ SUPPORTED_TASKS = { "pt": "distilbert-base-uncased-finetuned-sst-2-english", "tf": "distilbert-base-uncased-finetuned-sst-2-english", }, - "config": "distilbert-base-uncased-finetuned-sst-2-english", - "tokenizer": "distilbert-base-uncased", }, }, "ner": { - "impl": NerPipeline, + "impl": TokenClassificationPipeline, "tf": TFAutoModelForTokenClassification if is_tf_available() else None, "pt": AutoModelForTokenClassification if is_torch_available() else None, "default": { @@ -1605,8 +1619,6 @@ SUPPORTED_TASKS = { "pt": "dbmdz/bert-large-cased-finetuned-conll03-english", "tf": "dbmdz/bert-large-cased-finetuned-conll03-english", }, - "config": "dbmdz/bert-large-cased-finetuned-conll03-english", - "tokenizer": "bert-large-cased", }, }, "question-answering": { @@ -1615,61 +1627,43 @@ SUPPORTED_TASKS = { "pt": AutoModelForQuestionAnswering if is_torch_available() else None, "default": { "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": "distilbert-base-cased-distilled-squad"}, - "config": None, - "tokenizer": ("distilbert-base-cased", {"use_fast": False}), }, }, "fill-mask": { "impl": FillMaskPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, - "default": { - "model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}, - "config": None, - "tokenizer": ("distilroberta-base", {"use_fast": False}), - }, + "default": {"model": {"pt": "distilroberta-base", "tf": "distilroberta-base"}}, }, "summarization": { "impl": SummarizationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, - "default": {"model": {"pt": "facebook/bart-large-cnn", "tf": "t5-small"}, "config": None, "tokenizer": None}, + "default": {"model": {"pt": "facebook/bart-large-cnn", "tf": "t5-small"}}, }, "translation_en_to_fr": { "impl": TranslationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, - "default": { - "model": {"pt": "t5-base", "tf": "t5-base"}, - "config": None, - "tokenizer": ("t5-base", {"use_fast": False}), - }, + "default": {"model": {"pt": "t5-base", "tf": "t5-base"}}, }, "translation_en_to_de": { "impl": TranslationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, - "default": { - "model": {"pt": "t5-base", "tf": "t5-base"}, - "config": None, - "tokenizer": ("t5-base", {"use_fast": False}), - }, + "default": {"model": {"pt": "t5-base", "tf": "t5-base"}}, }, "translation_en_to_ro": { "impl": TranslationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, - "default": { - "model": {"pt": "t5-base", "tf": "t5-base"}, - "config": None, - "tokenizer": ("t5-base", {"use_fast": False}), - }, + "default": {"model": {"pt": "t5-base", "tf": "t5-base"}}, }, "text-generation": { "impl": TextGenerationPipeline, "tf": TFAutoModelWithLMHead if is_tf_available() else None, "pt": AutoModelWithLMHead if is_torch_available() else None, - "default": {"model": {"pt": "gpt2", "tf": "gpt2"}, "config": None, "tokenizer": "gpt2"}, + "default": {"model": {"pt": "gpt2", "tf": "gpt2"}}, }, } @@ -1698,11 +1692,12 @@ def pipeline( - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline` - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline` - - "ner": will return a :class:`~transformers.NerPipeline` + - "ner": will return a :class:`~transformers.TokenClassificationPipeline` - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline` - "fill-mask": will return a :class:`~transformers.FillMaskPipeline` - "summarization": will return a :class:`~transformers.SummarizationPipeline` - "translation_xx_to_yy": will return a :class:`~transformers.TranslationPipeline` + - "text-generation": will return a :class:`~transformers.TextGenerationPipeline` model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`): The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a model identifier or an actual pre-trained model inheriting from @@ -1759,14 +1754,13 @@ def pipeline( # Use default model/config/tokenizer for the task if no model is provided if model is None: - models, config, tokenizer = [targeted_task["default"][k] for k in ["model", "config", "tokenizer"]] - model = models[framework] + model = targeted_task["default"]["model"][framework] # Try to infer tokenizer from model or config name (if provided as str) if tokenizer is None: - if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: + if isinstance(model, str): tokenizer = model - elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: + elif isinstance(config, str): tokenizer = config else: # Impossible to guest what is the right tokenizer here