From 4d1ad832368254310eae058dae4dc07e7ed57a6e Mon Sep 17 00:00:00 2001 From: John Hewitt Date: Wed, 27 Feb 2019 14:50:41 -0800 Subject: [PATCH] update docstring of BERT tokenizer to reflect do_wordpiece_only --- pytorch_pretrained_bert/tokenization.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index 9ee8be2039..4ea8de6f70 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -79,8 +79,16 @@ class BertTokenizer(object): """Constructs a BertTokenizer. Args: - do_lower_case: Whether to lower case the input. - do_wordpiece_only: Whether to do basic tokenization before wordpiece. + vocab_file: Path to a one-wordpiece-per-line vocabulary file. + do_lower_case: Whether to lower case the input. + Only has an effect when do_basic_tokenize=True. + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to. + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_basic_tokenize=True. """ if not os.path.isfile(vocab_file): raise ValueError(