fill_mask helper (#2576)

* fill_mask helper

* [poc] FillMaskPipeline

* Revert "[poc] FillMaskPipeline"

This reverts commit 67eeea55b0f97b46c2b828de0f4ee97d87338335.

* Revert "fill_mask helper"

This reverts commit cacc17b884e14bb6b07989110ffe884ad9e36eaa.

* README: clarify that Pipelines can also do text-classification

cf. question at the AI&ML meetup last week, @mfuntowicz

* Fix test: test feature-extraction pipeline

* Test tweaks

* Slight refactor of existing pipeline (in preparation of new FillMaskPipeline)

* Extraneous doc

* More robust way of doing this

@mfuntowicz as we don't rely on the model name anymore (see AutoConfig)

* Also add RobertaConfig as a quickfix for wrong token_type_ids

* cs

* [BIG] FillMaskPipeline
This commit is contained in:
Julien Chaumond
2020-01-30 18:15:42 -05:00
committed by GitHub
parent b43cb09aaa
commit 9fa836a73f
4 changed files with 201 additions and 18 deletions

View File

@@ -1,7 +1,8 @@
import unittest
from typing import Iterable
from typing import Iterable, List, Optional
from transformers import pipeline
from transformers.pipelines import Pipeline
from .utils import require_tf, require_torch
@@ -62,9 +63,25 @@ TEXT_CLASSIF_FINETUNED_MODELS = {
)
}
FILL_MASK_FINETUNED_MODELS = {
("distilroberta-base", "distilroberta-base", None),
}
TF_FILL_MASK_FINETUNED_MODELS = {
("distilroberta-base", "distilroberta-base", None),
}
class MonoColumnInputTestCase(unittest.TestCase):
def _test_mono_column_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
def _test_mono_column_pipeline(
self,
nlp: Pipeline,
valid_inputs: List,
invalid_inputs: List,
output_keys: Iterable[str],
expected_multi_result: Optional[List] = None,
expected_check_keys: Optional[List[str]] = None,
):
self.assertIsNotNone(nlp)
mono_result = nlp(valid_inputs[0])
@@ -81,6 +98,13 @@ class MonoColumnInputTestCase(unittest.TestCase):
self.assertIsInstance(multi_result, list)
self.assertIsInstance(multi_result[0], (dict, list))
if expected_multi_result is not None:
for result, expect in zip(multi_result, expected_multi_result):
for key in expected_check_keys or []:
self.assertEqual(
set([o[key] for o in result]), set([o[key] for o in expect]),
)
if isinstance(multi_result[0], list):
multi_result = multi_result[0]
@@ -110,7 +134,7 @@ class MonoColumnInputTestCase(unittest.TestCase):
@require_torch
def test_sentiment_analysis(self):
mandatory_keys = {"label"}
mandatory_keys = {"label", "score"}
valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
invalid_inputs = [None]
for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS:
@@ -119,7 +143,7 @@ class MonoColumnInputTestCase(unittest.TestCase):
@require_tf
def test_tf_sentiment_analysis(self):
mandatory_keys = {"label"}
mandatory_keys = {"label", "score"}
valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
invalid_inputs = [None]
for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS:
@@ -127,21 +151,87 @@ class MonoColumnInputTestCase(unittest.TestCase):
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
@require_torch
def test_features_extraction(self):
def test_feature_extraction(self):
valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
invalid_inputs = [None]
for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS:
nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
nlp = pipeline(task="feature-extraction", model=model, config=config, tokenizer=tokenizer)
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
@require_tf
def test_tf_features_extraction(self):
def test_tf_feature_extraction(self):
valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
invalid_inputs = [None]
for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS:
nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
nlp = pipeline(task="feature-extraction", model=model, config=config, tokenizer=tokenizer)
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
@require_torch
def test_fill_mask(self):
    """Exercise the fill-mask pipeline for every PyTorch checkpoint listed.

    Each entry of FILL_MASK_FINETUNED_MODELS is turned into a ``fill-mask``
    pipeline limited to the two best candidates (``topk=2``) and handed to
    the shared mono-column checker together with reference predictions.
    """
    prompts = ["My name is <mask>", "The largest city in France is <mask>"]
    rejected = [None]
    required_keys = {"sequence", "score", "token"}

    # Reference top-2 completions, one list per prompt. Only the "sequence"
    # field is requested for comparison below (expected_check_keys); the
    # scores and token ids are recorded alongside it.
    name_completions = [
        {"score": 0.008698059245944023, "sequence": "<s>My name is John</s>", "token": 610},
        {"score": 0.007750614080578089, "sequence": "<s>My name is Chris</s>", "token": 1573},
    ]
    city_completions = [
        {"score": 0.2721288502216339, "sequence": "<s>The largest city in France is Paris</s>", "token": 2201},
        {"score": 0.19764970242977142, "sequence": "<s>The largest city in France is Lyon</s>", "token": 12790},
    ]
    reference = [name_completions, city_completions]

    for tokenizer, model, config in FILL_MASK_FINETUNED_MODELS:
        filler = pipeline(task="fill-mask", model=model, config=config, tokenizer=tokenizer, topk=2)
        self._test_mono_column_pipeline(
            filler,
            prompts,
            rejected,
            required_keys,
            expected_multi_result=reference,
            expected_check_keys=["sequence"],
        )
@require_tf
def test_tf_fill_mask(self):
    """Exercise the fill-mask pipeline for every TensorFlow checkpoint listed.

    Each entry of TF_FILL_MASK_FINETUNED_MODELS is turned into a
    ``fill-mask`` pipeline limited to the two best candidates (``topk=2``)
    and handed to the shared mono-column checker together with reference
    predictions.
    """
    prompts = ["My name is <mask>", "The largest city in France is <mask>"]
    rejected = [None]
    required_keys = {"sequence", "score", "token"}

    # Reference top-2 completions, one list per prompt. Only the "sequence"
    # field is requested for comparison below (expected_check_keys); the
    # scores and token ids are recorded alongside it.
    name_completions = [
        {"score": 0.008698059245944023, "sequence": "<s>My name is John</s>", "token": 610},
        {"score": 0.007750614080578089, "sequence": "<s>My name is Chris</s>", "token": 1573},
    ]
    city_completions = [
        {"score": 0.2721288502216339, "sequence": "<s>The largest city in France is Paris</s>", "token": 2201},
        {"score": 0.19764970242977142, "sequence": "<s>The largest city in France is Lyon</s>", "token": 12790},
    ]
    reference = [name_completions, city_completions]

    for tokenizer, model, config in TF_FILL_MASK_FINETUNED_MODELS:
        filler = pipeline(task="fill-mask", model=model, config=config, tokenizer=tokenizer, topk=2)
        self._test_mono_column_pipeline(
            filler,
            prompts,
            rejected,
            required_keys,
            expected_multi_result=reference,
            expected_check_keys=["sequence"],
        )
class MultiColumnInputTestCase(unittest.TestCase):
def _test_multicolumn_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):