From 5739726fccaddde4506242974c5c27468ff86d81 Mon Sep 17 00:00:00 2001 From: Connor Henderson Date: Tue, 11 Jul 2023 11:07:58 -0400 Subject: [PATCH] fix: Text splitting in the BasicTokenizer (#22280) * fix: Apostrophe splitting in the BasicTokenizer for CLIPTokenizer * account for apostrophe at start of new word * remove _run_split_on_punc, use re.findall instead * remove debugging, make style and quality * use pattern and punc splitting, repo-consistency will fail * remove commented out debugging * adds bool args to BasicTokenizer, remove pattern * do_split_on_punc default True * clean stray comments and line breaks * rebase, repo-consistency * update to just do punctuation split * add unicode normalizing back * remove redundant line --- .../models/bert/tokenization_bert.py | 22 +++++++++++++---- .../tokenization_bert_japanese.py | 22 +++++++++++++---- .../models/clip/tokenization_clip.py | 24 ++++++++++++++----- .../models/convbert/tokenization_convbert.py | 22 +++++++++++++---- .../distilbert/tokenization_distilbert.py | 22 +++++++++++++---- .../models/electra/tokenization_electra.py | 22 +++++++++++++---- .../models/funnel/tokenization_funnel.py | 22 +++++++++++++---- .../models/herbert/tokenization_herbert.py | 22 +++++++++++++---- .../models/layoutlm/tokenization_layoutlm.py | 22 +++++++++++++---- .../layoutlmv2/tokenization_layoutlmv2.py | 22 +++++++++++++---- .../models/lxmert/tokenization_lxmert.py | 22 +++++++++++++---- .../mobilebert/tokenization_mobilebert.py | 22 +++++++++++++---- .../models/mpnet/tokenization_mpnet.py | 22 +++++++++++++---- .../models/openai/tokenization_openai.py | 22 +++++++++++++---- .../prophetnet/tokenization_prophetnet.py | 22 +++++++++++++---- .../retribert/tokenization_retribert.py | 22 +++++++++++++---- .../models/roc_bert/tokenization_roc_bert.py | 22 +++++++++++++---- .../models/roformer/tokenization_roformer.py | 22 +++++++++++++---- .../squeezebert/tokenization_squeezebert.py | 22 +++++++++++++---- 
.../models/tapas/tokenization_tapas.py | 22 +++++++++++++---- tests/models/bert/test_tokenization_bert.py | 6 +++++ tests/models/clip/test_tokenization_clip.py | 4 ++-- 22 files changed, 349 insertions(+), 103 deletions(-) diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index 8d13bb4e54..536eb08640 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -385,20 +385,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. """ - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -417,7 +427,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). 
if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -445,7 +457,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 5af9984bb9..dd29010918 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -748,20 +748,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
""" - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -780,7 +790,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -808,7 +820,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py index e3ff5f8626..127480b90c 100644 --- a/src/transformers/models/clip/tokenization_clip.py +++ b/src/transformers/models/clip/tokenization_clip.py @@ -126,20 +126,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or 
not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. """ - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -158,7 +168,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). 
if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -186,7 +198,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 @@ -316,7 +328,7 @@ class CLIPTokenizer(PreTrainedTokenizer): self.fix_text = ftfy.fix_text except ImportError: logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.") - self.nlp = BasicTokenizer(do_lower_case=True) + self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False) self.fix_text = None with open(vocab_file, encoding="utf-8") as vocab_handle: diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py index cbee21aafe..4fbed8fe10 100644 --- a/src/transformers/models/convbert/tokenization_convbert.py +++ b/src/transformers/models/convbert/tokenization_convbert.py @@ -325,20 +325,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
""" - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -357,7 +367,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -385,7 +397,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py index 76582ae4ea..0259682586 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert.py +++ b/src/transformers/models/distilbert/tokenization_distilbert.py @@ -350,20 +350,30 @@ class 
BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. """ - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -382,7 +392,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). 
if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -410,7 +422,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py index 673c1db611..e202f773ef 100644 --- a/src/transformers/models/electra/tokenization_electra.py +++ b/src/transformers/models/electra/tokenization_electra.py @@ -342,20 +342,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
""" - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -374,7 +384,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -402,7 +414,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py index 245694bfac..f085fd7c47 100644 --- a/src/transformers/models/funnel/tokenization_funnel.py +++ b/src/transformers/models/funnel/tokenization_funnel.py @@ -359,20 +359,30 @@ class BasicTokenizer(object): strip_accents (`bool`, 
*optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. """ - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -391,7 +401,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). 
if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -419,7 +431,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/herbert/tokenization_herbert.py b/src/transformers/models/herbert/tokenization_herbert.py index 3d07e68e18..91ce0dcca5 100644 --- a/src/transformers/models/herbert/tokenization_herbert.py +++ b/src/transformers/models/herbert/tokenization_herbert.py @@ -143,20 +143,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
""" - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -175,7 +185,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -203,7 +215,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py index 99f517c6a2..57c29d5870 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py @@ -324,20 +324,30 @@ class BasicTokenizer(object): strip_accents 
(`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. """ - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -356,7 +366,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). 
if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -384,7 +396,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py index c6a8857325..1799cc2921 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py @@ -1362,20 +1362,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
""" - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -1394,7 +1404,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -1422,7 +1434,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py index 84d05bebce..daa761878d 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert.py +++ b/src/transformers/models/lxmert/tokenization_lxmert.py @@ -316,20 +316,30 @@ class BasicTokenizer(object): strip_accents (`bool`, 
*optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. """ - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -348,7 +358,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). 
if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -376,7 +388,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py b/src/transformers/models/mobilebert/tokenization_mobilebert.py index 0ccf9efe02..63c0ab28a7 100644 --- a/src/transformers/models/mobilebert/tokenization_mobilebert.py +++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py @@ -314,20 +314,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
""" - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -346,7 +356,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -374,7 +386,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py index 57a06beeea..f1347da08a 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet.py +++ b/src/transformers/models/mpnet/tokenization_mpnet.py @@ -342,20 +342,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): 
Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. """ - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -374,7 +384,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). 
if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -402,7 +414,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/openai/tokenization_openai.py b/src/transformers/models/openai/tokenization_openai.py index 36035eafec..0a7f93a7b2 100644 --- a/src/transformers/models/openai/tokenization_openai.py +++ b/src/transformers/models/openai/tokenization_openai.py @@ -71,20 +71,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
""" - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -103,7 +113,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -131,7 +143,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py index 36104d49fb..03e9083e74 100644 --- a/src/transformers/models/prophetnet/tokenization_prophetnet.py +++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py @@ -72,20 +72,30 @@ class BasicTokenizer(object): 
strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. """ - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -104,7 +114,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). 
if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -132,7 +144,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/retribert/tokenization_retribert.py b/src/transformers/models/retribert/tokenization_retribert.py index 0c04c363eb..b4a0f06192 100644 --- a/src/transformers/models/retribert/tokenization_retribert.py +++ b/src/transformers/models/retribert/tokenization_retribert.py @@ -333,20 +333,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
""" - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -365,7 +375,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -393,7 +405,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index 4338c098ba..cee778dc87 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -931,20 +931,30 @@ class RoCBertBasicTokenizer(object): 
strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. """ - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -963,7 +973,9 @@ class RoCBertBasicTokenizer(object): # words in the English Wikipedia.). 
if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -991,7 +1003,7 @@ class RoCBertBasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py index 1e4907f50c..dc406fa480 100644 --- a/src/transformers/models/roformer/tokenization_roformer.py +++ b/src/transformers/models/roformer/tokenization_roformer.py @@ -107,20 +107,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
""" - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -139,7 +149,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -167,7 +179,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert.py b/src/transformers/models/squeezebert/tokenization_squeezebert.py index ed7be941e0..ccce92809e 100644 --- a/src/transformers/models/squeezebert/tokenization_squeezebert.py +++ b/src/transformers/models/squeezebert/tokenization_squeezebert.py @@ -328,20 +328,30 @@ class 
BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. """ - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -360,7 +370,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). 
if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -388,7 +400,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 0bd558aee8..c3d35d0c87 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -2055,20 +2055,30 @@ class BasicTokenizer(object): strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
""" - def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): if never_split is None: never_split = [] self.do_lower_case = do_lower_case self.never_split = set(never_split) self.tokenize_chinese_chars = tokenize_chinese_chars self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc def tokenize(self, text, never_split=None): """ - Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see - WordPieceTokenizer. + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. Args: never_split (`List[str]`, *optional*) @@ -2087,7 +2097,9 @@ class BasicTokenizer(object): # words in the English Wikipedia.). if self.tokenize_chinese_chars: text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) split_tokens = [] for token in orig_tokens: if token not in never_split: @@ -2115,7 +2127,7 @@ class BasicTokenizer(object): def _run_split_on_punc(self, text, never_split=None): """Splits punctuation on a piece of text.""" - if never_split is not None and text in never_split: + if not self.do_split_on_punc or (never_split is not None and text in never_split): return [text] chars = list(text) i = 0 diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py index c5ebd6dbf3..f9383756e3 100644 --- a/tests/models/bert/test_tokenization_bert.py +++ b/tests/models/bert/test_tokenization_bert.py @@ -182,6 +182,12 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.tokenize(" 
\tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] ) + def test_basic_tokenizer_splits_on_punctuation(self): + tokenizer = BasicTokenizer() + text = "a\n'll !!to?'d of, can't." + expected = ["a", "'", "ll", "!", "!", "to", "?", "'", "d", "of", ",", "can", "'", "t", "."] + self.assertListEqual(tokenizer.tokenize(text), expected) + def test_wordpiece_tokenizer(self): vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] diff --git a/tests/models/clip/test_tokenization_clip.py b/tests/models/clip/test_tokenization_clip.py index fc95826710..27387be42b 100644 --- a/tests/models/clip/test_tokenization_clip.py +++ b/tests/models/clip/test_tokenization_clip.py @@ -81,7 +81,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat" + text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d." text_tokenized_s = tokenizer_s.tokenize(text) text_tokenized_r = tokenizer_r.tokenize(text) @@ -122,7 +122,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # "\u0085", # (next line) ] - # The tokenization is not identical for the character "\u0085" (next line). The slow version transforms + # The tokenization is not identical for the character "\u0085" (next line). The slow version using ftfy transforms # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a # space (and thus into an empty list).