* TF Tapas first commit

* updated docs

* updated logger message

* updated pytorch weight conversion
script to support scalar array

* added use_cache to tapas model config to
work properly with tf input_processing

* 1. rm embeddings_sum
2. added # Copied
3. + TFTapasMLMHead
4. and lot other small fixes

* updated docs

* + test for tapas

* updated testing_utils to check
is_tensorflow_probability_available

* converted model logits post processing using
numpy to work with both PT and TF models

* + TFAutoModelForTableQuestionAnswering

* added TF support

* added test for
TFAutoModelForTableQuestionAnswering

* added test for
TFAutoModelForTableQuestionAnswering pipeline

* updated auto model docs

* fixed typo in import

* added tensorflow_probability to run tests

* updated MLM head

* updated tapas.rst with TF  model docs

* fixed optimizer import in docs

* updated convert to np
data from pt model is not
`transformers.tokenization_utils_base.BatchEncoding`
after pipeline upgrade

* updated pipeline:
1. with torch.no_gard removed, pipeline forward handles
2. token_type_ids converted to numpy

* updated docs.

* removed `use_cache` from config

* removed floats_tensor

* updated code comment

* updated Copyright Year and
logits_aggregation Optional

* updated docs and comments

* updated docstring

* fixed model weight loading

* make fixup

* fix indentation

* added tf slow pipeline test

* pip upgrade

* upgrade python to 3.7

* removed from_pt from tests

* revert commit f18cfa9
This commit is contained in:
Kamal Raj
2021-11-30 15:37:55 +05:30
committed by GitHub
parent 6fc38adff2
commit c468a87a69
19 changed files with 4324 additions and 75 deletions

View File

@@ -17,8 +17,14 @@ import copy
import tempfile
import unittest
from transformers import CONFIG_MAPPING, AutoConfig, BertConfig, GPT2Config, T5Config, is_tf_available
from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, require_tf, slow
from transformers import CONFIG_MAPPING, AutoConfig, BertConfig, GPT2Config, T5Config, TapasConfig, is_tf_available
from transformers.testing_utils import (
DUMMY_UNKNOWN_IDENTIFIER,
SMALL_MODEL_IDENTIFIER,
require_tensorflow_probability,
require_tf,
slow,
)
from .test_modeling_bert import BertModelTester
@@ -32,6 +38,7 @@ if is_tf_available():
TFAutoModelForQuestionAnswering,
TFAutoModelForSeq2SeqLM,
TFAutoModelForSequenceClassification,
TFAutoModelForTableQuestionAnswering,
TFAutoModelForTokenClassification,
TFAutoModelWithLMHead,
TFBertForMaskedLM,
@@ -44,6 +51,7 @@ if is_tf_available():
TFGPT2LMHeadModel,
TFRobertaForMaskedLM,
TFT5ForConditionalGeneration,
TFTapasForQuestionAnswering,
)
from transformers.models.auto.modeling_tf_auto import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
@@ -52,6 +60,7 @@ if is_tf_available():
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
@@ -59,6 +68,7 @@ if is_tf_available():
from transformers.models.bert.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.models.gpt2.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.models.t5.modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.models.tapas.modeling_tf_tapas import TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST
class NewModelConfig(BertConfig):
@@ -176,6 +186,21 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForQuestionAnswering)
@slow
@require_tensorflow_probability
def test_table_question_answering_model_from_pretrained(self):
for model_name in TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST[5:6]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, TapasConfig)
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_name)
model, loading_info = TFAutoModelForTableQuestionAnswering.from_pretrained(
model_name, output_loading_info=True
)
self.assertIsNotNone(model)
self.assertIsInstance(model, TFTapasForQuestionAnswering)
def test_from_pretrained_identifier(self):
model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
self.assertIsInstance(model, TFBertForMaskedLM)
@@ -210,6 +235,7 @@ class TFAutoModelTest(unittest.TestCase):
TF_MODEL_MAPPING,
TF_MODEL_FOR_PRETRAINING_MAPPING,
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,

File diff suppressed because it is too large Load Diff

View File

@@ -19,11 +19,13 @@ from transformers import (
AutoModelForTableQuestionAnswering,
AutoTokenizer,
TableQuestionAnsweringPipeline,
TFAutoModelForTableQuestionAnswering,
pipeline,
)
from transformers.testing_utils import (
is_pipeline_test,
require_pandas,
require_tensorflow_probability,
require_tf,
require_torch,
require_torch_scatter,
@@ -33,6 +35,7 @@ from transformers.testing_utils import (
from .test_pipelines_common import PipelineTestCaseMeta
@require_tensorflow_probability
@require_torch_scatter
@require_torch
@require_pandas
@@ -43,9 +46,105 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
@require_tf
@unittest.skip("Table question answering not implemented in TF")
def test_small_model_tf(self):
pass
model_id = "lysandre/tiny-tapas-random-wtq"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
self.assertIsInstance(model.config.aggregation_labels, dict)
self.assertIsInstance(model.config.no_aggregation_label_index, int)
table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query="how many movies has george clooney played in?",
)
self.assertEqual(
outputs,
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
)
self.assertEqual(
outputs,
[
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
],
)
outputs = table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
query=[
"What repository has the largest number of stars?",
"Given that the numbers of stars defines if a repository is active, what repository is the most active?",
"What is the number of repositories?",
"What is the average number of stars?",
"What is the total amount of stars?",
],
)
self.assertEqual(
outputs,
[
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
{"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
],
)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table=None)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table="")
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table={})
with self.assertRaises(ValueError):
table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
}
)
with self.assertRaises(ValueError):
table_querier(
query="",
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
with self.assertRaises(ValueError):
table_querier(
query=None,
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
@require_torch
def test_small_model_pt(self):
@@ -148,7 +247,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
},
)
def test_slow_tokenizer_sqa(self):
@require_torch
def test_slow_tokenizer_sqa_pt(self):
model_id = "lysandre/tiny-tapas-random-sqa"
model = AutoModelForTableQuestionAnswering.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -265,8 +365,126 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
},
)
@require_tf
def test_slow_tokenizer_sqa_tf(self):
model_id = "lysandre/tiny-tapas-random-sqa"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer)
inputs = {
"table": {
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
"query": ["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
}
sequential_outputs = table_querier(**inputs, sequential=True)
batch_outputs = table_querier(**inputs, sequential=False)
self.assertEqual(len(sequential_outputs), 3)
self.assertEqual(len(batch_outputs), 3)
self.assertEqual(sequential_outputs[0], batch_outputs[0])
self.assertNotEqual(sequential_outputs[1], batch_outputs[1])
# self.assertNotEqual(sequential_outputs[2], batch_outputs[2])
table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query="how many movies has george clooney played in?",
)
self.assertEqual(
outputs,
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
)
outputs = table_querier(
table={
"actors": ["brad pitt", "leonardo di caprio", "george clooney"],
"age": ["56", "45", "59"],
"number of movies": ["87", "53", "69"],
"date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
},
query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
)
self.assertEqual(
outputs,
[
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
{"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
],
)
outputs = table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
query=[
"What repository has the largest number of stars?",
"Given that the numbers of stars defines if a repository is active, what repository is the most active?",
"What is the number of repositories?",
"What is the average number of stars?",
"What is the total amount of stars?",
],
)
self.assertEqual(
outputs,
[
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
{"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
],
)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table=None)
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table="")
with self.assertRaises(ValueError):
table_querier(query="What does it do with empty context ?", table={})
with self.assertRaises(ValueError):
table_querier(
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
}
)
with self.assertRaises(ValueError):
table_querier(
query="",
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
with self.assertRaises(ValueError):
table_querier(
query=None,
table={
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
},
)
@slow
def test_integration_wtq(self):
def test_integration_wtq_pt(self):
table_querier = pipeline("table-question-answering")
data = {
@@ -310,7 +528,54 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
self.assertListEqual(results, expected_results)
@slow
def test_integration_sqa(self):
def test_integration_wtq_tf(self):
model_id = "google/tapas-base-finetuned-wtq"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
table_querier = pipeline("table-question-answering", model=model, tokenizer=tokenizer)
data = {
"Repository": ["Transformers", "Datasets", "Tokenizers"],
"Stars": ["36542", "4512", "3934"],
"Contributors": ["651", "77", "34"],
"Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
}
queries = [
"What repository has the largest number of stars?",
"Given that the numbers of stars defines if a repository is active, what repository is the most active?",
"What is the number of repositories?",
"What is the average number of stars?",
"What is the total amount of stars?",
]
results = table_querier(data, queries)
expected_results = [
{"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"], "aggregator": "NONE"},
{"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"], "aggregator": "NONE"},
{
"answer": "COUNT > Transformers, Datasets, Tokenizers",
"coordinates": [(0, 0), (1, 0), (2, 0)],
"cells": ["Transformers", "Datasets", "Tokenizers"],
"aggregator": "COUNT",
},
{
"answer": "AVERAGE > 36542, 4512, 3934",
"coordinates": [(0, 1), (1, 1), (2, 1)],
"cells": ["36542", "4512", "3934"],
"aggregator": "AVERAGE",
},
{
"answer": "SUM > 36542, 4512, 3934",
"coordinates": [(0, 1), (1, 1), (2, 1)],
"cells": ["36542", "4512", "3934"],
"aggregator": "SUM",
},
]
self.assertListEqual(results, expected_results)
@slow
def test_integration_sqa_pt(self):
table_querier = pipeline(
"table-question-answering",
model="google/tapas-base-finetuned-sqa",
@@ -331,3 +596,29 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
{"answer": "28 november 1967", "coordinates": [(2, 3)], "cells": ["28 november 1967"]},
]
self.assertListEqual(results, expected_results)
@slow
def test_integration_sqa_tf(self):
model_id = "google/tapas-base-finetuned-sqa"
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
table_querier = pipeline(
"table-question-answering",
model=model,
tokenizer=tokenizer,
)
data = {
"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
"Age": ["56", "45", "59"],
"Number of movies": ["87", "53", "69"],
"Date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
}
queries = ["How many movies has George Clooney played in?", "How old is he?", "What's his date of birth?"]
results = table_querier(data, queries, sequential=True)
expected_results = [
{"answer": "69", "coordinates": [(2, 2)], "cells": ["69"]},
{"answer": "59", "coordinates": [(2, 1)], "cells": ["59"]},
{"answer": "28 november 1967", "coordinates": [(2, 3)], "cells": ["28 november 1967"]},
]
self.assertListEqual(results, expected_results)