fix: Text splitting in the BasicTokenizer (#22280)

* fix: Apostrophe splitting in the BasicTokenizer for CLIPTokenizer

* account for apostrophe at start of new word

* remove _run_split_on_punc, use re.findall instead

* remove debugging, make style and quality

* use pattern and punc splitting, repo-consistency will fail

* remove commented out debugging

* adds bool args to BasicTokenizer, remove pattern

* do_split_on_punc default True

* clean stray comments and line breaks

* rebase, repo-consistency

* update to just do punctuation split

* add unicode normalizing back

* remove redundant line
This commit is contained in:
Connor Henderson
2023-07-11 11:07:58 -04:00
committed by GitHub
parent 2489e380e4
commit 5739726fcc
22 changed files with 349 additions and 103 deletions

View File

@@ -385,20 +385,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -417,7 +427,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -445,7 +457,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -748,20 +748,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -780,7 +790,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -808,7 +820,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -126,20 +126,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -158,7 +168,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -186,7 +198,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0
@@ -316,7 +328,7 @@ class CLIPTokenizer(PreTrainedTokenizer):
self.fix_text = ftfy.fix_text self.fix_text = ftfy.fix_text
except ImportError: except ImportError:
logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.") logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.")
self.nlp = BasicTokenizer(do_lower_case=True) self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
self.fix_text = None self.fix_text = None
with open(vocab_file, encoding="utf-8") as vocab_handle: with open(vocab_file, encoding="utf-8") as vocab_handle:

View File

@@ -325,20 +325,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -357,7 +367,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -385,7 +397,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -350,20 +350,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -382,7 +392,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -410,7 +422,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -342,20 +342,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -374,7 +384,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -402,7 +414,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -359,20 +359,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -391,7 +401,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -419,7 +431,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -143,20 +143,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -175,7 +185,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -203,7 +215,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -324,20 +324,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -356,7 +366,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -384,7 +396,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -1362,20 +1362,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -1394,7 +1404,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -1422,7 +1434,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -316,20 +316,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -348,7 +358,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -376,7 +388,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -314,20 +314,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -346,7 +356,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -374,7 +386,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -342,20 +342,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -374,7 +384,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -402,7 +414,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -71,20 +71,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -103,7 +113,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -131,7 +143,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -72,20 +72,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -104,7 +114,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -132,7 +144,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -333,20 +333,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -365,7 +375,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -393,7 +405,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -931,20 +931,30 @@ class RoCBertBasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -963,7 +973,9 @@ class RoCBertBasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -991,7 +1003,7 @@ class RoCBertBasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -107,20 +107,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -139,7 +149,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -167,7 +179,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -328,20 +328,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -360,7 +370,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -388,7 +400,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -2055,20 +2055,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
""" """
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None: if never_split is None:
never_split = [] never_split = []
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.never_split = set(never_split) self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
""" """
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
WordPieceTokenizer.
Args: Args:
never_split (`List[str]`, *optional*) never_split (`List[str]`, *optional*)
@@ -2087,7 +2097,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.). # words in the English Wikipedia.).
if self.tokenize_chinese_chars: if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text) text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text) # prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = [] split_tokens = []
for token in orig_tokens: for token in orig_tokens:
if token not in never_split: if token not in never_split:
@@ -2115,7 +2127,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None): def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text.""" """Splits punctuation on a piece of text."""
if never_split is not None and text in never_split: if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text] return [text]
chars = list(text) chars = list(text)
i = 0 i = 0

View File

@@ -182,6 +182,12 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
) )
def test_basic_tokenizer_splits_on_punctuation(self):
tokenizer = BasicTokenizer()
text = "a\n'll !!to?'d of, can't."
expected = ["a", "'", "ll", "!", "!", "to", "?", "'", "d", "of", ",", "can", "'", "t", "."]
self.assertListEqual(tokenizer.tokenize(text), expected)
def test_wordpiece_tokenizer(self): def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]

View File

@@ -81,7 +81,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat" text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."
text_tokenized_s = tokenizer_s.tokenize(text) text_tokenized_s = tokenizer_s.tokenize(text)
text_tokenized_r = tokenizer_r.tokenize(text) text_tokenized_r = tokenizer_r.tokenize(text)
@@ -122,7 +122,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# "\u0085", # (next line) # "\u0085", # (next line)
] ]
# The tokenization is not identical for the character "\u0085" (next line). The slow version transforms # The tokenization is not identical for the character "\u0085" (next line). The slow version using ftfy transforms
# it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
# space (and thus into an empty list). # space (and thus into an empty list).