fix: Text splitting in the BasicTokenizer (#22280)
* fix: Apostrophe splitting in the BasicTokenizer for CLIPTokenizer * account for apostrophe at start of new word * remove _run_split_on_punc, use re.findall instead * remove debugging, make style and quality * use pattern and punc splitting, repo-consistency will fail * remove commented out debugging * adds bool args to BasicTokenizer, remove pattern * do_split_on_punc default True * clean stray comments and line breaks * rebase, repo-consistency * update to just do punctuation split * add unicode normalizing back * remove redundant line
This commit is contained in:
@@ -385,20 +385,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -417,7 +427,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -445,7 +457,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -748,20 +748,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -780,7 +790,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -808,7 +820,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -126,20 +126,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -158,7 +168,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -186,7 +198,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
@@ -316,7 +328,7 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
|||||||
self.fix_text = ftfy.fix_text
|
self.fix_text = ftfy.fix_text
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.")
|
logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.")
|
||||||
self.nlp = BasicTokenizer(do_lower_case=True)
|
self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
|
||||||
self.fix_text = None
|
self.fix_text = None
|
||||||
|
|
||||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||||
|
|||||||
@@ -325,20 +325,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -357,7 +367,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -385,7 +397,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -350,20 +350,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -382,7 +392,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -410,7 +422,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -342,20 +342,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -374,7 +384,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -402,7 +414,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -359,20 +359,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -391,7 +401,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -419,7 +431,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -143,20 +143,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -175,7 +185,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -203,7 +215,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -324,20 +324,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -356,7 +366,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -384,7 +396,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -1362,20 +1362,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -1394,7 +1404,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -1422,7 +1434,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -316,20 +316,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -348,7 +358,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -376,7 +388,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -314,20 +314,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -346,7 +356,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -374,7 +386,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -342,20 +342,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -374,7 +384,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -402,7 +414,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -71,20 +71,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -103,7 +113,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -131,7 +143,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -72,20 +72,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -104,7 +114,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -132,7 +144,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -333,20 +333,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -365,7 +375,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -393,7 +405,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -931,20 +931,30 @@ class RoCBertBasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -963,7 +973,9 @@ class RoCBertBasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -991,7 +1003,7 @@ class RoCBertBasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -107,20 +107,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -139,7 +149,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -167,7 +179,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -328,20 +328,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -360,7 +370,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -388,7 +400,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -2055,20 +2055,30 @@ class BasicTokenizer(object):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
do_split_on_punc (`bool`, *optional*, defaults to `True`):
|
||||||
|
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
|
||||||
|
the full context of the words, such as contractions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
|
def __init__(
|
||||||
|
self,
|
||||||
|
do_lower_case=True,
|
||||||
|
never_split=None,
|
||||||
|
tokenize_chinese_chars=True,
|
||||||
|
strip_accents=None,
|
||||||
|
do_split_on_punc=True,
|
||||||
|
):
|
||||||
if never_split is None:
|
if never_split is None:
|
||||||
never_split = []
|
never_split = []
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
self.never_split = set(never_split)
|
self.never_split = set(never_split)
|
||||||
self.tokenize_chinese_chars = tokenize_chinese_chars
|
self.tokenize_chinese_chars = tokenize_chinese_chars
|
||||||
self.strip_accents = strip_accents
|
self.strip_accents = strip_accents
|
||||||
|
self.do_split_on_punc = do_split_on_punc
|
||||||
|
|
||||||
def tokenize(self, text, never_split=None):
|
def tokenize(self, text, never_split=None):
|
||||||
"""
|
"""
|
||||||
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
|
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
|
||||||
WordPieceTokenizer.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
never_split (`List[str]`, *optional*)
|
never_split (`List[str]`, *optional*)
|
||||||
@@ -2087,7 +2097,9 @@ class BasicTokenizer(object):
|
|||||||
# words in the English Wikipedia.).
|
# words in the English Wikipedia.).
|
||||||
if self.tokenize_chinese_chars:
|
if self.tokenize_chinese_chars:
|
||||||
text = self._tokenize_chinese_chars(text)
|
text = self._tokenize_chinese_chars(text)
|
||||||
orig_tokens = whitespace_tokenize(text)
|
# prevents treating the same character with different unicode codepoints as different characters
|
||||||
|
unicode_normalized_text = unicodedata.normalize("NFC", text)
|
||||||
|
orig_tokens = whitespace_tokenize(unicode_normalized_text)
|
||||||
split_tokens = []
|
split_tokens = []
|
||||||
for token in orig_tokens:
|
for token in orig_tokens:
|
||||||
if token not in never_split:
|
if token not in never_split:
|
||||||
@@ -2115,7 +2127,7 @@ class BasicTokenizer(object):
|
|||||||
|
|
||||||
def _run_split_on_punc(self, text, never_split=None):
|
def _run_split_on_punc(self, text, never_split=None):
|
||||||
"""Splits punctuation on a piece of text."""
|
"""Splits punctuation on a piece of text."""
|
||||||
if never_split is not None and text in never_split:
|
if not self.do_split_on_punc or (never_split is not None and text in never_split):
|
||||||
return [text]
|
return [text]
|
||||||
chars = list(text)
|
chars = list(text)
|
||||||
i = 0
|
i = 0
|
||||||
|
|||||||
@@ -182,6 +182,12 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
|
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_basic_tokenizer_splits_on_punctuation(self):
|
||||||
|
tokenizer = BasicTokenizer()
|
||||||
|
text = "a\n'll !!to?'d of, can't."
|
||||||
|
expected = ["a", "'", "ll", "!", "!", "to", "?", "'", "d", "of", ",", "can", "'", "t", "."]
|
||||||
|
self.assertListEqual(tokenizer.tokenize(text), expected)
|
||||||
|
|
||||||
def test_wordpiece_tokenizer(self):
|
def test_wordpiece_tokenizer(self):
|
||||||
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
|
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
|
||||||
|
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"
|
text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."
|
||||||
text_tokenized_s = tokenizer_s.tokenize(text)
|
text_tokenized_s = tokenizer_s.tokenize(text)
|
||||||
text_tokenized_r = tokenizer_r.tokenize(text)
|
text_tokenized_r = tokenizer_r.tokenize(text)
|
||||||
|
|
||||||
@@ -122,7 +122,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
# "\u0085", # (next line)
|
# "\u0085", # (next line)
|
||||||
]
|
]
|
||||||
|
|
||||||
# The tokenization is not identical for the character "\u0085" (next line). The slow version transforms
|
# The tokenization is not identical for the character "\u0085" (next line). The slow version using ftfy transforms
|
||||||
# it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
|
# it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
|
||||||
# space (and thus into an empty list).
|
# space (and thus into an empty list).
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user