NER: Add new WNUT’17 example (#4681)

* ner: add preprocessing script for examples that splits longer sentences
* ner: example shell scripts use local preprocessing now
* ner: add new example section for WNUT’17 NER task. Remove old English CoNLL-03 results
* ner: satisfy black and isort
2020-06-05 01:13:17 +02:00
parent 0e1869cc28
commit 2a4b9e09c0
4 changed files with 167 additions and 31 deletions
--- a/examples/token-classification/scripts/preprocess.py
+++ b/examples/token-classification/scripts/preprocess.py
@@ -0,0 +1,84 @@
+"""Split over-long sentences of a token-per-line NER dataset.
+
+Usage: preprocess.py <dataset> <model_name_or_path> <max_len>
+
+Reads <dataset> (one token per line, token in the first whitespace-separated
+column, sentences separated by empty lines) and writes it to stdout,
+inserting an extra empty line whenever the running number of subword tokens
+in the current sentence would exceed <max_len> minus the number of special
+tokens the tokenizer adds. Lines whose token yields no subwords are dropped.
+Input is read and output is written as UTF-8, independent of the locale.
+"""
+
+import sys
+
+from transformers import AutoTokenizer
+
+
+def split_long_sentences(lines, tokenizer, max_len):
+    """Yield the lines of a dataset with over-long sentences split.
+
+    Args:
+        lines: Iterable of raw dataset lines; trailing whitespace is stripped.
+        tokenizer: Object whose ``tokenize(str)`` returns a list of subwords.
+        max_len: Maximum number of subwords allowed per sentence.
+
+    Yields:
+        Stripped lines, with an empty line inserted in front of any token
+        that would push the current sentence past ``max_len`` subwords.
+    """
+    subword_len_counter = 0
+
+    for line in lines:
+        line = line.rstrip()
+
+        # An empty line ends the current sentence.
+        if not line:
+            yield line
+            subword_len_counter = 0
+            continue
+
+        token = line.split()[0]
+        current_subwords_len = len(tokenizer.tokenize(token))
+
+        # Token contains strange control characters like \x96 or \x95
+        # Just filter out the complete line
+        if current_subwords_len == 0:
+            continue
+
+        # Only break a sentence that already holds tokens: a single token
+        # longer than max_len at the start of a sentence would otherwise
+        # produce a spurious empty sentence.
+        if subword_len_counter > 0 and subword_len_counter + current_subwords_len > max_len:
+            yield ""
+            subword_len_counter = 0
+
+        subword_len_counter += current_subwords_len
+        yield line
+
+
+def main(argv):
+    """Run the preprocessing for the command line arguments in ``argv``."""
+    if len(argv) < 4:
+        sys.exit(f"usage: {argv[0]} <dataset> <model_name_or_path> <max_len>")
+
+    dataset = argv[1]
+    model_name_or_path = argv[2]
+    max_len = int(argv[3])
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+    # Reserve room for the special tokens the tokenizer adds per sequence.
+    max_len -= tokenizer.num_special_tokens_to_add()
+
+    # Wrap the stdout descriptor so the output encoding matches the input
+    # encoding regardless of the locale; closefd=False leaves stdout open.
+    sys.stdout.flush()
+    with open(dataset, "rt", encoding="utf-8") as f_p, open(
+        sys.stdout.fileno(), "wt", encoding="utf-8", closefd=False
+    ) as out:
+        for out_line in split_long_sentences(f_p, tokenizer, max_len):
+            out.write(out_line + "\n")
+
+
+if __name__ == "__main__":
+    main(sys.argv)