diff --git a/README.md b/README.md index a28338c0a1..3f5389f8a7 100644 --- a/README.md +++ b/README.md @@ -521,8 +521,9 @@ You can create `Pipeline` objects for the following down-stream tasks: - `feature-extraction`: Generates a tensor representation for the input sequence - `ner`: Generates named entity mapping for each word in the input sequence. - `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence. - - `question-answering`: Provided some context and a question refering to the context, it will extract the answer to the question - in the context. + - `text-classification`: Initialize a `TextClassificationPipeline` directly, or see `sentiment-analysis` for an example. + - `question-answering`: Provided some context and a question referring to the context, it will extract the answer to the question in the context. + - `fill-mask`: Takes an input sequence containing a masked token (e.g. `<mask>`) and returns a list of the most probable filled sequences, with their probabilities. 
```python from transformers import pipeline diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2a2fef742e..13990695ce 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -93,6 +93,7 @@ from .modeling_tf_pytorch_utils import ( from .pipelines import ( CsvPipelineDataFormat, FeatureExtractionPipeline, + FillMaskPipeline, JsonPipelineDataFormat, NerPipeline, PipedPipelineDataFormat, diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 4eb5d5d1a6..d694afbaa5 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -28,7 +28,10 @@ from typing import Dict, List, Optional, Tuple, Union import numpy as np from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig +from .configuration_distilbert import DistilBertConfig +from .configuration_roberta import RobertaConfig from .configuration_utils import PretrainedConfig +from .configuration_xlm import XLMConfig from .data import SquadExample, squad_convert_examples_to_features from .file_utils import is_tf_available, is_torch_available from .modelcard import ModelCard @@ -44,6 +47,7 @@ if is_tf_available(): TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification, + TFAutoModelWithLMHead, ) if is_torch_available(): @@ -53,6 +57,7 @@ if is_torch_available(): AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification, + AutoModelWithLMHead, ) @@ -64,7 +69,7 @@ def get_framework(model=None): If both frameworks are installed and no specific model is provided, defaults to using PyTorch. """ if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str): - # Both framework are available but the use supplied a model class instance. + # Both framework are available but the user supplied a model class instance. 
def __call__(self, *texts, **kwargs):
    """
    Tokenize the given inputs and run the model on them.

    Every positional and keyword argument is handed to `_parse_and_tokenize`;
    whatever it returns is passed to `_forward`, and the result of `_forward`
    is returned unchanged.
    """
    return self._forward(self._parse_and_tokenize(*texts, **kwargs))
class FillMaskPipeline(Pipeline):
    """
    Masked language modeling prediction pipeline using ModelWithLMHead head.

    Every input sequence must contain exactly one occurrence of the tokenizer's mask token. For each
    sequence, the pipeline returns the `topk` most probable replacements of that token.
    """

    def __init__(
        self,
        model,
        tokenizer: PreTrainedTokenizer = None,
        modelcard: ModelCard = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = None,
        device: int = -1,
        topk: int = 5,
    ):
        """
        Args:
            model, tokenizer, modelcard, framework, args_parser, device: forwarded unchanged to
                `Pipeline.__init__` (which is also called with `binary_output=True`).
            topk: number of candidate fillings returned for each input sequence.
        """
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=True,
        )

        self.topk = topk

    def __call__(self, *args, **kwargs):
        """
        Fill the masked token of each input sequence.

        Args:
            *args, **kwargs: forwarded to `_parse_and_tokenize`.

        Returns:
            For each input sequence, a list of `topk` dicts with the keys:
                - "sequence": the decoded input with the mask replaced by the candidate (padding removed),
                - "score": the softmax probability of the candidate at the masked position,
                - "token": the candidate's token id.
            When a single sequence is processed that list is returned directly, otherwise a list
            of such lists (one per sequence) is returned.

        Raises:
            ValueError: if a sequence does not contain exactly one mask token.
        """
        inputs = self._parse_and_tokenize(*args, **kwargs)
        outputs = self._forward(inputs, return_tensors=True)

        mask_token_id = self.tokenizer.mask_token_id
        pad_token_id = self.tokenizer.pad_token_id

        results = []
        for i in range(outputs.shape[0]):
            # Convert once per sequence; both pt and tf tensors expose `.numpy()`.
            token_ids = inputs["input_ids"][i].numpy()

            # Fail with an actionable message instead of an opaque scalar-conversion error
            # when the sequence holds zero or several mask tokens.
            mask_positions = np.flatnonzero(token_ids == mask_token_id)
            if mask_positions.size != 1:
                raise ValueError(
                    "FillMaskPipeline expects exactly one mask token ({}) per input sequence, "
                    "found {} in input {}.".format(self.tokenizer.mask_token, mask_positions.size, i)
                )
            masked_index = int(mask_positions[0])

            logits = outputs[i, masked_index, :]
            if self.framework == "tf":
                probs = tf.nn.softmax(logits)
                topk = tf.math.top_k(probs, k=self.topk)
                values, predictions = topk.values.numpy(), topk.indices.numpy()
            else:
                probs = logits.softmax(dim=0)
                values, predictions = probs.topk(self.topk)

            result = []
            for v, p in zip(values.tolist(), predictions.tolist()):
                # Work on a copy: with PyTorch the numpy array shares memory with the
                # input tensor, so writing into it would alter the tokenized inputs.
                tokens = token_ids.copy()
                tokens[masked_index] = p
                # Filter padding out:
                tokens = tokens[tokens != pad_token_id]
                result.append({"sequence": self.tokenizer.decode(tokens), "score": v, "token": p})

            results.append(result)

        if len(results) == 1:
            return results[0]
        return results
+ expected_multi_result: Optional[List] = None, + expected_check_keys: Optional[List[str]] = None, + ): self.assertIsNotNone(nlp) mono_result = nlp(valid_inputs[0]) @@ -81,6 +98,13 @@ class MonoColumnInputTestCase(unittest.TestCase): self.assertIsInstance(multi_result, list) self.assertIsInstance(multi_result[0], (dict, list)) + if expected_multi_result is not None: + for result, expect in zip(multi_result, expected_multi_result): + for key in expected_check_keys or []: + self.assertEqual( + set([o[key] for o in result]), set([o[key] for o in expect]), + ) + if isinstance(multi_result[0], list): multi_result = multi_result[0] @@ -110,7 +134,7 @@ class MonoColumnInputTestCase(unittest.TestCase): @require_torch def test_sentiment_analysis(self): - mandatory_keys = {"label"} + mandatory_keys = {"label", "score"} valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS: @@ -119,7 +143,7 @@ class MonoColumnInputTestCase(unittest.TestCase): @require_tf def test_tf_sentiment_analysis(self): - mandatory_keys = {"label"} + mandatory_keys = {"label", "score"} valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS: @@ -127,21 +151,87 @@ class MonoColumnInputTestCase(unittest.TestCase): self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @require_torch - def test_features_extraction(self): + def test_feature_extraction(self): valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS: - nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="feature-extraction", 
@require_torch
def test_fill_mask(self):
    """Run the PyTorch fill-mask pipeline and compare the top-2 filled sequences with the expected ones."""
    mandatory_keys = {"sequence", "score", "token"}
    # Each input must carry the mask token, otherwise the pipeline has nothing to fill.
    valid_inputs = [
        "My name is <mask>",
        "The largest city in France is <mask>",
    ]
    invalid_inputs = [None]
    expected_multi_result = [
        [
            {"score": 0.008698059245944023, "sequence": "My name is John", "token": 610},
            {"score": 0.007750614080578089, "sequence": "My name is Chris", "token": 1573},
        ],
        [
            {"score": 0.2721288502216339, "sequence": "The largest city in France is Paris", "token": 2201},
            {
                "score": 0.19764970242977142,
                "sequence": "The largest city in France is Lyon",
                "token": 12790,
            },
        ],
    ]
    for tokenizer, model, config in FILL_MASK_FINETUNED_MODELS:
        nlp = pipeline(task="fill-mask", model=model, config=config, tokenizer=tokenizer, topk=2)
        # Only the "sequence" key is compared against the expected results.
        self._test_mono_column_pipeline(
            nlp,
            valid_inputs,
            invalid_inputs,
            mandatory_keys,
            expected_multi_result=expected_multi_result,
            expected_check_keys=["sequence"],
        )


@require_tf
def test_tf_fill_mask(self):
    """Run the TensorFlow fill-mask pipeline and compare the top-2 filled sequences with the expected ones."""
    mandatory_keys = {"sequence", "score", "token"}
    # Each input must carry the mask token, otherwise the pipeline has nothing to fill.
    valid_inputs = [
        "My name is <mask>",
        "The largest city in France is <mask>",
    ]
    invalid_inputs = [None]
    expected_multi_result = [
        [
            {"score": 0.008698059245944023, "sequence": "My name is John", "token": 610},
            {"score": 0.007750614080578089, "sequence": "My name is Chris", "token": 1573},
        ],
        [
            {"score": 0.2721288502216339, "sequence": "The largest city in France is Paris", "token": 2201},
            {
                "score": 0.19764970242977142,
                "sequence": "The largest city in France is Lyon",
                "token": 12790,
            },
        ],
    ]
    for tokenizer, model, config in TF_FILL_MASK_FINETUNED_MODELS:
        nlp = pipeline(task="fill-mask", model=model, config=config, tokenizer=tokenizer, topk=2)
        # Only the "sequence" key is compared against the expected results.
        self._test_mono_column_pipeline(
            nlp,
            valid_inputs,
            invalid_inputs,
            mandatory_keys,
            expected_multi_result=expected_multi_result,
            expected_check_keys=["sequence"],
        )