From d2183a46fbb2172e1cb5d883c9a611010b608979 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 6 Jan 2022 15:45:41 +0100 Subject: [PATCH] Remove old asserts. (#15012) --- tests/test_tokenization_common.py | 109 ++++++++++++++++-------------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 1f6087ae4c..3a6da1e122 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -893,7 +893,7 @@ class TokenizerTesterMixin: sequence = tokenizer.encode(seq_0, add_special_tokens=False) total_length = len(sequence) - assert total_length > 4, "Issue with the testing sequence, please update it it's too short" + self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short") # Test with max model input length model_max_length = tokenizer.model_max_length @@ -902,9 +902,9 @@ class TokenizerTesterMixin: sequence1 = tokenizer(seq_1, add_special_tokens=False) total_length1 = len(sequence1["input_ids"]) - assert ( - total_length1 > model_max_length - ), "Issue with the testing sequence, please update it it's too short" + self.assertGreater( + total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short" + ) # Simple padding_strategies = ( @@ -989,7 +989,7 @@ class TokenizerTesterMixin: ids = None seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False) - assert len(seq0_tokens) > 2 + stride + self.assertGreater(len(seq0_tokens), 2 + stride) seq_1 = "This is another sentence to be encoded." seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False) @@ -998,7 +998,7 @@ class TokenizerTesterMixin: seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False) seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False) - assert len(seq1_tokens) > 2 + stride + self.assertGreater(len(seq1_tokens), 2 + stride) smallest = seq1_tokens if len(seq0_tokens) > len(seq1_tokens) else seq0_tokens @@ -1010,14 +1010,18 @@ class TokenizerTesterMixin: model_max_length = tokenizer.model_max_length self.assertEqual(model_max_length, 100) seq_2 = seq_0 * model_max_length - assert len(seq_2) > model_max_length + self.assertGreater(len(seq_2), model_max_length) sequence1 = tokenizer(seq_1, add_special_tokens=False) total_length1 = len(sequence1["input_ids"]) sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False) total_length2 = len(sequence2["input_ids"]) - assert total_length1 < model_max_length - 10, "Issue with the testing sequence, please update it." - assert total_length2 > model_max_length, "Issue with the testing sequence, please update it." + self.assertLess( + total_length1, model_max_length - 10, "Issue with the testing sequence, please update it." + ) + self.assertGreater( + total_length2, model_max_length, "Issue with the testing sequence, please update it." + ) # Simple padding_strategies = ( @@ -1279,7 +1283,7 @@ class TokenizerTesterMixin: # # Test first masked sequence # encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False) # encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False) - # assert len(encoded_masked) == len(encoded_0) + # self.assertEqual(len(encoded_masked), len(encoded_0)) # mask_loc = encoded_masked.index(mask_ind) # encoded_masked[mask_loc] = encoded_0[mask_loc] @@ -1288,7 +1292,7 @@ class TokenizerTesterMixin: # # Test second masked sequence # encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False) # encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False) - # assert len(encoded_masked) == len(encoded_1) + # self.assertEqual(len(encoded_masked), len(encoded_1)) # mask_loc = encoded_masked.index(mask_ind) # encoded_masked[mask_loc] = encoded_1[mask_loc] @@ -1356,8 +1360,8 @@ class TokenizerTesterMixin: sequence, max_length=sequence_length + padding_size, padding="max_length" ) padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + self.assertEqual(sequence_length + padding_size, padded_sequence_length) + self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence) # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True tokenizer.padding_side = "left" @@ -1367,8 +1371,8 @@ class TokenizerTesterMixin: sequence, max_length=sequence_length + padding_size, padding="max_length" ) padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert [padding_idx] * padding_size + encoded_sequence == padded_sequence + self.assertEqual(sequence_length + padding_size, padded_sequence_length) + self.assertEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence) # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding' encoded_sequence = tokenizer.encode(sequence) @@ -1377,26 +1381,26 @@ class TokenizerTesterMixin: tokenizer.padding_side = "right" padded_sequence_right = tokenizer.encode(sequence, padding=True) padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right + self.assertEqual(sequence_length, padded_sequence_right_length) + self.assertEqual(encoded_sequence, padded_sequence_right) tokenizer.padding_side = "left" padded_sequence_left = tokenizer.encode(sequence, padding="longest") padded_sequence_left_length = len(padded_sequence_left) - assert sequence_length == padded_sequence_left_length - assert encoded_sequence == padded_sequence_left + self.assertEqual(sequence_length, padded_sequence_left_length) + self.assertEqual(encoded_sequence, padded_sequence_left) tokenizer.padding_side = "right" padded_sequence_right = tokenizer.encode(sequence) padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right + self.assertEqual(sequence_length, padded_sequence_right_length) + self.assertEqual(encoded_sequence, padded_sequence_right) tokenizer.padding_side = "left" padded_sequence_left = tokenizer.encode(sequence, padding=False) padded_sequence_left_length = len(padded_sequence_left) - assert sequence_length == padded_sequence_left_length - assert encoded_sequence == padded_sequence_left + self.assertEqual(sequence_length, padded_sequence_left_length) + self.assertEqual(encoded_sequence, padded_sequence_left) def test_right_and_left_truncation(self): tokenizers = self.get_tokenizers(do_lower_case=False) @@ -1478,8 +1482,8 @@ class TokenizerTesterMixin: sequence, max_length=sequence_length + padding_size, pad_to_max_length=True ) padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + self.assertEqual(sequence_length + padding_size, padded_sequence_length) + self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence) # Check that nothing is done when a maximum length is not specified encoded_sequence = tokenizer.encode(sequence) @@ -1488,8 +1492,8 @@ class TokenizerTesterMixin: tokenizer.padding_side = "right" padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True) padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right + self.assertEqual(sequence_length, padded_sequence_right_length) + self.assertEqual(encoded_sequence, padded_sequence_right) def test_padding_to_multiple_of(self): tokenizers = self.get_tokenizers() @@ -1575,9 +1579,9 @@ class TokenizerTesterMixin: not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] not_padded_sequence_length = len(not_padded_input_ids) - assert sequence_length == not_padded_sequence_length - assert input_ids == not_padded_input_ids - assert special_tokens_mask == not_padded_special_tokens_mask + self.assertEqual(sequence_length, not_padded_sequence_length) + self.assertEqual(input_ids, not_padded_input_ids) + self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask) not_padded_sequence = tokenizer.encode_plus( sequence, @@ -1589,9 +1593,9 @@ class TokenizerTesterMixin: not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"] not_padded_sequence_length = len(not_padded_input_ids) - assert sequence_length == not_padded_sequence_length - assert input_ids == not_padded_input_ids - assert special_tokens_mask == not_padded_special_tokens_mask + self.assertEqual(sequence_length, not_padded_sequence_length) + self.assertEqual(input_ids, not_padded_input_ids) + self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask) # Test right padding tokenizer.padding_side = "right" @@ -1607,9 +1611,9 @@ class TokenizerTesterMixin: right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] right_padded_sequence_length = len(right_padded_input_ids) - assert sequence_length + padding_size == right_padded_sequence_length - assert input_ids + [padding_idx] * padding_size == right_padded_input_ids - assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask + self.assertEqual(sequence_length + padding_size, right_padded_sequence_length) + self.assertEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids) + self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask) # Test left padding tokenizer.padding_side = "left" @@ -1623,25 +1627,29 @@ class TokenizerTesterMixin: left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) - assert sequence_length + padding_size == left_padded_sequence_length - assert [padding_idx] * padding_size + input_ids == left_padded_input_ids - assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask + self.assertEqual(sequence_length + padding_size, left_padded_sequence_length) + self.assertEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids) + self.assertEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask) if "token_type_ids" in tokenizer.model_input_names: token_type_ids = encoded_sequence["token_type_ids"] left_padded_token_type_ids = left_padded_sequence["token_type_ids"] right_padded_token_type_ids = right_padded_sequence["token_type_ids"] - assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids - assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids + self.assertEqual( + token_type_ids + [token_type_padding_idx] * padding_size, right_padded_token_type_ids + ) + self.assertEqual( + [token_type_padding_idx] * padding_size + token_type_ids, left_padded_token_type_ids + ) if "attention_mask" in tokenizer.model_input_names: attention_mask = encoded_sequence["attention_mask"] right_padded_attention_mask = right_padded_sequence["attention_mask"] left_padded_attention_mask = left_padded_sequence["attention_mask"] - assert attention_mask + [0] * padding_size == right_padded_attention_mask - assert [0] * padding_size + attention_mask == left_padded_attention_mask + self.assertEqual(attention_mask + [0] * padding_size, right_padded_attention_mask) + self.assertEqual([0] * padding_size + attention_mask, left_padded_attention_mask) def test_separate_tokenizers(self): # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when @@ -1652,9 +1660,9 @@ class TokenizerTesterMixin: for tokenizer, new_tokenizer in zip(tokenizers, new_tokenizers): with self.subTest(f"{tokenizer.__class__.__name__}"): - assert tokenizer.init_kwargs["random_argument"] is True - assert tokenizer.init_kwargs["random_argument"] is True - assert new_tokenizer.init_kwargs["random_argument"] is False + self.assertTrue(tokenizer.init_kwargs["random_argument"]) + self.assertTrue(tokenizer.init_kwargs["random_argument"]) + self.assertFalse(new_tokenizer.init_kwargs["random_argument"]) def test_get_vocab(self): tokenizers = self.get_tokenizers(do_lower_case=False) @@ -2119,11 +2127,8 @@ class TokenizerTesterMixin: # Make sure the model contains at least the full vocabulary size in its embedding matrix is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight") - assert ( - (model.get_input_embeddings().weight.shape[0] >= len(tokenizer)) - if is_using_common_embeddings - else True - ) + if is_using_common_embeddings: + self.assertGreaterEqual(model.get_input_embeddings().weight.shape[0], len(tokenizer)) # Build sequence first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] @@ -2170,7 +2175,7 @@ class TokenizerTesterMixin: model = model_class(config) # Make sure the model contains at least the full vocabulary size in its embedding matrix - assert model.config.vocab_size >= len(tokenizer) + self.assertGreaterEqual(model.config.vocab_size, len(tokenizer)) # Build sequence first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]