update docstring of BERT tokenizer to reflect do_wordpiece_only

This commit is contained in:
John Hewitt
2019-02-27 14:50:41 -08:00
parent e14c6b52e3
commit 4d1ad83236

View File

@@ -79,8 +79,16 @@ class BertTokenizer(object):
"""Constructs a BertTokenizer. """Constructs a BertTokenizer.
Args: Args:
do_lower_case: Whether to lower case the input. vocab_file: Path to a one-wordpiece-per-line vocabulary file
do_wordpiece_only: Whether to do basic tokenization before wordpiece. do_lower_case: Whether to lower case the input
Only has an effect when do_basic_tokenize=True
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_basic_tokenize=True
""" """
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
raise ValueError( raise ValueError(