NER: Add new WNUT’17 example (#4681)
* ner: add preprocessing script for examples that splits longer sentences
* ner: example shell scripts use local preprocessing now
* ner: add new example section for WNUT’17 NER task. Remove old English CoNLL-03 results
* ner: satisfy black and isort
This commit is contained in:
41
examples/token-classification/scripts/preprocess.py
Normal file
41
examples/token-classification/scripts/preprocess.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import sys
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
|
||||
def split_long_sentences(lines, tokenize, max_len):
    """Yield CoNLL-style lines, breaking sentences that exceed a subword budget.

    Each input line is right-stripped. An empty line is a sentence boundary:
    it is yielded unchanged and resets the running subword count. For every
    other line, the first whitespace-separated field is taken as the token
    and ``tokenize(token)`` is used to count its subwords.

    Args:
        lines: Iterable of text lines (e.g. an open file object).
        tokenize: Callable mapping a token string to a sequence of subwords;
            only the length of the result is used.
        max_len: Maximum number of subwords allowed per output sentence
            (special tokens must already be subtracted by the caller).

    Yields:
        The stripped lines, with an extra empty string inserted before any
        token that would push the running subword count above ``max_len``.
        Lines whose token produces zero subwords are dropped entirely.
    """
    subword_len_counter = 0

    for raw_line in lines:
        line = raw_line.rstrip()

        if not line:
            # Existing sentence boundary: pass it through and start counting anew.
            yield line
            subword_len_counter = 0
            continue

        token = line.split()[0]
        current_subwords_len = len(tokenize(token))

        # Token contains strange control characters like \x96 or \x95
        # Just filter out the complete line
        if current_subwords_len == 0:
            continue

        if subword_len_counter + current_subwords_len > max_len:
            # Start a new sentence so the current one stays within budget.
            # NOTE: this also fires when the very first token of a sentence is
            # longer than max_len on its own, producing an empty separator.
            yield ""
            subword_len_counter = 0

        subword_len_counter += current_subwords_len
        yield line


def main(argv=None):
    """Command-line entry point: ``preprocess.py DATASET MODEL_NAME_OR_PATH MAX_LEN``.

    Loads the tokenizer for ``MODEL_NAME_OR_PATH``, reduces ``MAX_LEN`` by the
    number of special tokens the tokenizer adds, and prints the re-split
    dataset to standard output.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.exit("usage: preprocess.py DATASET MODEL_NAME_OR_PATH MAX_LEN")

    dataset = argv[1]
    model_name_or_path = argv[2]
    max_len = int(argv[3])

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    # Reserve room for the special tokens the model adds around each sequence.
    max_len -= tokenizer.num_special_tokens_to_add()

    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which differs between systems.
    with open(dataset, "rt", encoding="utf-8") as f_p:
        for line in split_long_sentences(f_p, tokenizer.tokenize, max_len):
            print(line)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user