Added code to raise ValueError in the BERT tokenizer's convert_tokens_to_ids for sequences longer than max_len

2018-12-18 14:41:30 +00:00
parent 786cc41299
commit 78cf7b4ab4
2 changed files with 53 additions and 13 deletions
--- a/tests/tokenization_test.py
+++ b/tests/tokenization_test.py
@@ -44,12 +44,30 @@ class TokenizationTest(unittest.TestCase):
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])

+    def test_full_tokenizer_raises_error_for_long_sequences(self):
+        vocab_tokens = [
+            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
+            "##ing", ","
+        ]
+        with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+            vocab_file = vocab_writer.name
+
+        tokenizer = BertTokenizer(vocab_file, max_len=10)
+        os.remove(vocab_file)
+        tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time")
+        indices = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(indices, [0 for _ in range(10)])
+
+        tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time .")
+        self.assertRaises(ValueError, tokenizer.convert_tokens_to_ids, tokens)
+
    def test_chinese(self):
        tokenizer = BasicTokenizer()
-    
+
        self.assertListEqual(
            tokenizer.tokenize(u"ah\u535A\u63A8zz"),
-            [u"ah", u"\u535A", u"\u63A8", u"zz"])  
+            [u"ah", u"\u535A", u"\u63A8", u"zz"])

    def test_basic_tokenizer_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=True)