From 2dd8f524f5ad475afce6ee01258d992593e69bea Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 6 Mar 2019 10:10:41 +0100 Subject: [PATCH] removing test for long sequences error following #337 --- tests/tokenization_test.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py index 6a14e05ae8..78e145ffd2 100644 --- a/tests/tokenization_test.py +++ b/tests/tokenization_test.py @@ -46,24 +46,6 @@ class TokenizationTest(unittest.TestCase): self.assertListEqual( tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) - def test_full_tokenizer_raises_error_for_long_sequences(self): - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing", "," - ] - with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - vocab_file = vocab_writer.name - - tokenizer = BertTokenizer(vocab_file, max_len=10) - os.remove(vocab_file) - tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time") - indices = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual(indices, [0 for _ in range(10)]) - - tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time .") - self.assertRaises(ValueError, tokenizer.convert_tokens_to_ids, tokens) - def test_chinese(self): tokenizer = BasicTokenizer()