update docstring of BERT tokenizer to reflect do_wordpiece_only

2019-02-27 14:50:41 -08:00
parent e14c6b52e3
commit 4d1ad83236
1 changed files with 10 additions and 2 deletions
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -79,8 +79,16 @@ class BertTokenizer(object):
        """Constructs a BertTokenizer.
        Args:
-          do_lower_case: Whether to lower case the input.
+          vocab_file: Path to a one-wordpiece-per-line vocabulary file.
-          do_wordpiece_only: Whether to do basic tokenization before wordpiece.
+          do_lower_case: Whether to lower case the input.
                         Only has an effect when do_basic_tokenize=True.
          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
          max_len: An artificial maximum length to truncate tokenized sequences to;
                         Effective maximum length is always the minimum of this
                         value (if specified) and the underlying BERT model's
                         sequence length.
          never_split: List of tokens which will never be split during tokenization.
                         Only has an effect when do_basic_tokenize=True.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(