From fd85734e0efe281887020f51d26ae6cd1df493e9 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Wed, 28 Jul 2021 09:38:12 -0400
Subject: [PATCH] Add option to set max_len in run_ner (#12929)

---
 examples/pytorch/token-classification/run_ner.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index c9cf2b0890..824f98574e 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -123,6 +123,13 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
+    max_seq_length: int = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. If set, sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
     pad_to_max_length: bool = field(
         default=False,
         metadata={
@@ -358,6 +365,7 @@ def main():
             examples[text_column_name],
             padding=padding,
             truncation=True,
+            max_length=data_args.max_seq_length,
             # We use this argument because the texts in our dataset are lists of words (with a label for each word).
             is_split_into_words=True,
         )