fill_mask helper (#2576)
* fill_mask helper * [poc] FillMaskPipeline * Revert "[poc] FillMaskPipeline" This reverts commit 67eeea55b0f97b46c2b828de0f4ee97d87338335. * Revert "fill_mask helper" This reverts commit cacc17b884e14bb6b07989110ffe884ad9e36eaa. * README: clarify that Pipelines can also do text-classification cf. question at the AI&ML meetup last week, @mfuntowicz * Fix test: test feature-extraction pipeline * Test tweaks * Slight refactor of existing pipeline (in preparation of new FillMaskPipeline) * Extraneous doc * More robust way of doing this @mfuntowicz as we don't rely on the model name anymore (see AutoConfig) * Also add RobertaConfig as a quickfix for wrong token_type_ids * cs * [BIG] FillMaskPipeline
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
import unittest
|
||||
from typing import Iterable
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
from transformers import pipeline
|
||||
from transformers.pipelines import Pipeline
|
||||
|
||||
from .utils import require_tf, require_torch
|
||||
|
||||
@@ -62,9 +63,25 @@ TEXT_CLASSIF_FINETUNED_MODELS = {
|
||||
)
|
||||
}
|
||||
|
||||
FILL_MASK_FINETUNED_MODELS = {
|
||||
("distilroberta-base", "distilroberta-base", None),
|
||||
}
|
||||
|
||||
TF_FILL_MASK_FINETUNED_MODELS = {
|
||||
("distilroberta-base", "distilroberta-base", None),
|
||||
}
|
||||
|
||||
|
||||
class MonoColumnInputTestCase(unittest.TestCase):
|
||||
def _test_mono_column_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
|
||||
def _test_mono_column_pipeline(
|
||||
self,
|
||||
nlp: Pipeline,
|
||||
valid_inputs: List,
|
||||
invalid_inputs: List,
|
||||
output_keys: Iterable[str],
|
||||
expected_multi_result: Optional[List] = None,
|
||||
expected_check_keys: Optional[List[str]] = None,
|
||||
):
|
||||
self.assertIsNotNone(nlp)
|
||||
|
||||
mono_result = nlp(valid_inputs[0])
|
||||
@@ -81,6 +98,13 @@ class MonoColumnInputTestCase(unittest.TestCase):
|
||||
self.assertIsInstance(multi_result, list)
|
||||
self.assertIsInstance(multi_result[0], (dict, list))
|
||||
|
||||
if expected_multi_result is not None:
|
||||
for result, expect in zip(multi_result, expected_multi_result):
|
||||
for key in expected_check_keys or []:
|
||||
self.assertEqual(
|
||||
set([o[key] for o in result]), set([o[key] for o in expect]),
|
||||
)
|
||||
|
||||
if isinstance(multi_result[0], list):
|
||||
multi_result = multi_result[0]
|
||||
|
||||
@@ -110,7 +134,7 @@ class MonoColumnInputTestCase(unittest.TestCase):
|
||||
|
||||
@require_torch
|
||||
def test_sentiment_analysis(self):
|
||||
mandatory_keys = {"label"}
|
||||
mandatory_keys = {"label", "score"}
|
||||
valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
|
||||
invalid_inputs = [None]
|
||||
for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS:
|
||||
@@ -119,7 +143,7 @@ class MonoColumnInputTestCase(unittest.TestCase):
|
||||
|
||||
@require_tf
|
||||
def test_tf_sentiment_analysis(self):
|
||||
mandatory_keys = {"label"}
|
||||
mandatory_keys = {"label", "score"}
|
||||
valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
|
||||
invalid_inputs = [None]
|
||||
for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS:
|
||||
@@ -127,21 +151,87 @@ class MonoColumnInputTestCase(unittest.TestCase):
|
||||
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
|
||||
|
||||
@require_torch
|
||||
def test_features_extraction(self):
|
||||
def test_feature_extraction(self):
|
||||
valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
|
||||
invalid_inputs = [None]
|
||||
for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS:
|
||||
nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
|
||||
nlp = pipeline(task="feature-extraction", model=model, config=config, tokenizer=tokenizer)
|
||||
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
|
||||
|
||||
@require_tf
|
||||
def test_tf_features_extraction(self):
|
||||
def test_tf_feature_extraction(self):
|
||||
valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
|
||||
invalid_inputs = [None]
|
||||
for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS:
|
||||
nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
|
||||
nlp = pipeline(task="feature-extraction", model=model, config=config, tokenizer=tokenizer)
|
||||
self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
|
||||
|
||||
@require_torch
|
||||
def test_fill_mask(self):
|
||||
mandatory_keys = {"sequence", "score", "token"}
|
||||
valid_inputs = [
|
||||
"My name is <mask>",
|
||||
"The largest city in France is <mask>",
|
||||
]
|
||||
invalid_inputs = [None]
|
||||
expected_multi_result = [
|
||||
[
|
||||
{"score": 0.008698059245944023, "sequence": "<s>My name is John</s>", "token": 610},
|
||||
{"score": 0.007750614080578089, "sequence": "<s>My name is Chris</s>", "token": 1573},
|
||||
],
|
||||
[
|
||||
{"score": 0.2721288502216339, "sequence": "<s>The largest city in France is Paris</s>", "token": 2201},
|
||||
{
|
||||
"score": 0.19764970242977142,
|
||||
"sequence": "<s>The largest city in France is Lyon</s>",
|
||||
"token": 12790,
|
||||
},
|
||||
],
|
||||
]
|
||||
for tokenizer, model, config in FILL_MASK_FINETUNED_MODELS:
|
||||
nlp = pipeline(task="fill-mask", model=model, config=config, tokenizer=tokenizer, topk=2)
|
||||
self._test_mono_column_pipeline(
|
||||
nlp,
|
||||
valid_inputs,
|
||||
invalid_inputs,
|
||||
mandatory_keys,
|
||||
expected_multi_result=expected_multi_result,
|
||||
expected_check_keys=["sequence"],
|
||||
)
|
||||
|
||||
@require_tf
|
||||
def test_tf_fill_mask(self):
|
||||
mandatory_keys = {"sequence", "score", "token"}
|
||||
valid_inputs = [
|
||||
"My name is <mask>",
|
||||
"The largest city in France is <mask>",
|
||||
]
|
||||
invalid_inputs = [None]
|
||||
expected_multi_result = [
|
||||
[
|
||||
{"score": 0.008698059245944023, "sequence": "<s>My name is John</s>", "token": 610},
|
||||
{"score": 0.007750614080578089, "sequence": "<s>My name is Chris</s>", "token": 1573},
|
||||
],
|
||||
[
|
||||
{"score": 0.2721288502216339, "sequence": "<s>The largest city in France is Paris</s>", "token": 2201},
|
||||
{
|
||||
"score": 0.19764970242977142,
|
||||
"sequence": "<s>The largest city in France is Lyon</s>",
|
||||
"token": 12790,
|
||||
},
|
||||
],
|
||||
]
|
||||
for tokenizer, model, config in TF_FILL_MASK_FINETUNED_MODELS:
|
||||
nlp = pipeline(task="fill-mask", model=model, config=config, tokenizer=tokenizer, topk=2)
|
||||
self._test_mono_column_pipeline(
|
||||
nlp,
|
||||
valid_inputs,
|
||||
invalid_inputs,
|
||||
mandatory_keys,
|
||||
expected_multi_result=expected_multi_result,
|
||||
expected_check_keys=["sequence"],
|
||||
)
|
||||
|
||||
|
||||
class MultiColumnInputTestCase(unittest.TestCase):
|
||||
def _test_multicolumn_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
|
||||
|
||||
Reference in New Issue
Block a user