Add min and max question length options to TapasTokenizer (#12803)
* Add min and max question length options to the tokenizer
* Add corresponding test
This commit is contained in:
@@ -1076,6 +1076,37 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertListEqual(new_encoded_inputs, dropped_encoded_inputs)
|
||||
self.assertLessEqual(len(new_encoded_inputs), 20)
|
||||
|
||||
@slow
def test_min_max_question_length(self):
    """Check that a question outside the configured length bounds is left out of the encoding.

    The same table/question pair is encoded twice: once with a tokenizer whose
    ``max_question_length`` is smaller than the question, and once with a tokenizer
    whose ``min_question_length`` is larger than the question. In both cases the
    encoding is expected to start with the ids ``[101, 102]``, i.e. no question
    tokens sit between them.
    """
    question = "When was Brad Pitt born?"
    table = pd.DataFrame.from_dict(
        {
            "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
            "Age": ["56", "45", "59"],
            "Number of movies": ["87", "53", "69"],
            "Date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"],
        }
    )

    # NOTE(review): 101 / 102 are presumably the [CLS] / [SEP] ids of this checkpoint's
    # vocabulary -- confirm against the tokenizer's vocab file.
    ids_without_question = [101, 102]

    # Each entry configures one bound that the question violates:
    #   * max_question_length=2  -> the question is longer than allowed
    #   * min_question_length=30 -> the question is shorter than required
    # Either way the query should not be tokenized.
    violated_bounds = (
        {"max_question_length": 2},
        {"min_question_length": 30},
    )

    for bound in violated_bounds:
        bounded_tokenizer = TapasTokenizer.from_pretrained("lysandre/tapas-temporary-repo", **bound)
        encoded = bounded_tokenizer(table=table, queries=question)
        self.assertListEqual(encoded.input_ids[:2], ids_without_question)
|
||||
|
||||
@is_pt_tf_cross_test
|
||||
def test_batch_encode_plus_tensors(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
|
||||
Reference in New Issue
Block a user