update tokenization docstrings for #328
This commit is contained in:
@@ -104,16 +104,23 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
|
def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
|
||||||
unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
|
unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
|
||||||
mask_token="[MASK]", **kwargs):
|
mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs):
|
||||||
"""Constructs a BertTokenizer.
|
"""Constructs a BertTokenizer.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_file: Path to a one-wordpiece-per-line vocabulary file
|
**vocab_file**: Path to a one-wordpiece-per-line vocabulary file
|
||||||
do_lower_case: Whether to lower case the input
|
**do_lower_case**: (`optional`) boolean (default True)
|
||||||
Only has an effect when do_wordpiece_only=False
|
Whether to lower case the input
|
||||||
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
|
Only has an effect when do_basic_tokenize=True
|
||||||
never_split: List of tokens which will never be split during tokenization.
|
**do_basic_tokenize**: (`optional`) boolean (default True)
|
||||||
Only has an effect when do_wordpiece_only=False
|
Whether to do basic tokenization before wordpiece.
|
||||||
|
**never_split**: (`optional`) list of string
|
||||||
|
List of tokens which will never be split during tokenization.
|
||||||
|
Only has an effect when do_basic_tokenize=True
|
||||||
|
**tokenize_chinese_chars**: (`optional`) boolean (default True)
|
||||||
|
Whether to tokenize Chinese characters.
|
||||||
|
This should likely be deactivated for Japanese:
|
||||||
|
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
|
||||||
"""
|
"""
|
||||||
super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
|
super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
|
||||||
pad_token=pad_token, cls_token=cls_token,
|
pad_token=pad_token, cls_token=cls_token,
|
||||||
@@ -127,8 +134,9 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
[(ids, tok) for tok, ids in self.vocab.items()])
|
[(ids, tok) for tok, ids in self.vocab.items()])
|
||||||
self.do_basic_tokenize = do_basic_tokenize
|
self.do_basic_tokenize = do_basic_tokenize
|
||||||
if do_basic_tokenize:
|
if do_basic_tokenize:
|
||||||
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
|
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
|
||||||
never_split=never_split)
|
never_split=never_split,
|
||||||
|
tokenize_chinese_chars=tokenize_chinese_chars)
|
||||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -196,21 +204,36 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
class BasicTokenizer(object):
|
class BasicTokenizer(object):
|
||||||
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
|
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
|
||||||
do_lower_case=True,
|
""" Constructs a BasicTokenizer.
|
||||||
never_split=None):
|
|
||||||
"""Constructs a BasicTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
do_lower_case: Whether to lower case the input.
|
**do_lower_case**: Whether to lower case the input.
|
||||||
|
**never_split**: (`optional`) list of str
|
||||||
|
Kept for backward compatibility purposes.
|
||||||
|
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
|
||||||
|
List of tokens not to split.
|
||||||
|
**tokenize_chinese_chars**: (`optional`) boolean (default True)
|
||||||
|
Whether to tokenize Chinese characters.
|
||||||
|
This should likely be deactivated for Japanese:
|
||||||
|
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
|
||||||
"""
|
"""
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = never_split
|
self.never_split = never_split
|
||||||
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None, tokenize_chinese_chars=True):
|
def tokenize(self, text, never_split=None):
|
||||||
"""Tokenizes a piece of text."""
|
""" Basic Tokenization of a piece of text.
|
||||||
|
Split on "white spaces" only; for sub-word tokenization, see WordpieceTokenizer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
**never_split**: (`optional`) list of str
|
||||||
|
Kept for backward compatibility purposes.
|
||||||
|
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
|
||||||
|
List of tokens not to split.
|
||||||
|
"""
|
||||||
never_split = self.never_split + (never_split if never_split is not None else [])
|
never_split = self.never_split + (never_split if never_split is not None else [])
|
||||||
text = self._clean_text(text)
|
text = self._clean_text(text)
|
||||||
# This was added on November 1st, 2018 for the multilingual and Chinese
|
# This was added on November 1st, 2018 for the multilingual and Chinese
|
||||||
@@ -219,7 +242,7 @@ class BasicTokenizer(object):
|
|||||||
# and generally don't have any Chinese data in them (there are Chinese
|
# and generally don't have any Chinese data in them (there are Chinese
|
||||||
# characters in the vocabulary because Wikipedia does have some Chinese
|
# characters in the vocabulary because Wikipedia does have some Chinese
|
||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
orig_tokens = whitespace_tokenize(text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
|
|||||||
Reference in New Issue
Block a user