clean_up_tokenization_spaces=False if unset (#31938)
* clean_up_tokenization_spaces=False if unset
* deprecate warning
* updating param for old models
* update models
* make fix-copies
* fix-copies and update bert models
* warning msg
* update prophet and clvp
* updating test since space before is arbitrarily removed
* remove warning for 4.45
This commit is contained in:
@@ -88,6 +88,9 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -105,6 +108,7 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="[MASK]",
|
mask_token="[MASK]",
|
||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
strip_accents=None,
|
strip_accents=None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
@@ -136,6 +140,7 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -91,6 +91,9 @@ class ConvBertTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original ConvBERT).
|
value for `lowercase` (as in the original ConvBERT).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -108,6 +111,7 @@ class ConvBertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="[MASK]",
|
mask_token="[MASK]",
|
||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
strip_accents=None,
|
strip_accents=None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
@@ -139,6 +143,7 @@ class ConvBertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -90,6 +90,9 @@ class DistilBertTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -108,6 +111,7 @@ class DistilBertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="[MASK]",
|
mask_token="[MASK]",
|
||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
strip_accents=None,
|
strip_accents=None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
@@ -138,6 +142,7 @@ class DistilBertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -90,6 +90,9 @@ class ElectraTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original Electra).
|
value for `lowercase` (as in the original Electra).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -107,6 +110,7 @@ class ElectraTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="[MASK]",
|
mask_token="[MASK]",
|
||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
strip_accents=None,
|
strip_accents=None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
@@ -138,6 +142,7 @@ class ElectraTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -107,6 +107,9 @@ class FunnelTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -127,6 +130,7 @@ class FunnelTokenizer(PreTrainedTokenizer):
|
|||||||
eos_token="</s>",
|
eos_token="</s>",
|
||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
strip_accents=None,
|
strip_accents=None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
@@ -159,6 +163,7 @@ class FunnelTokenizer(PreTrainedTokenizer):
|
|||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -91,6 +91,9 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original LayoutLM).
|
value for `lowercase` (as in the original LayoutLM).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -108,6 +111,7 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="[MASK]",
|
mask_token="[MASK]",
|
||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
strip_accents=None,
|
strip_accents=None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
@@ -139,6 +143,7 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -90,6 +90,9 @@ class LxmertTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original Lxmert).
|
value for `lowercase` (as in the original Lxmert).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -107,6 +110,7 @@ class LxmertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="[MASK]",
|
mask_token="[MASK]",
|
||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
strip_accents=None,
|
strip_accents=None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
@@ -138,6 +142,7 @@ class LxmertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -92,6 +92,9 @@ class MobileBertTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original MobileBERT).
|
value for `lowercase` (as in the original MobileBERT).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -109,6 +112,7 @@ class MobileBertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="[MASK]",
|
mask_token="[MASK]",
|
||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
strip_accents=None,
|
strip_accents=None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
@@ -140,6 +144,7 @@ class MobileBertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -108,6 +108,9 @@ class MPNetTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -128,6 +131,7 @@ class MPNetTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="<mask>",
|
mask_token="<mask>",
|
||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
strip_accents=None,
|
strip_accents=None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
|
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
|
||||||
@@ -170,6 +174,7 @@ class MPNetTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -308,6 +308,9 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original BERT).
|
value for `lowercase` (as in the original BERT).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -330,6 +333,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token: Optional[str] = "[MASK]",
|
mask_token: Optional[str] = "[MASK]",
|
||||||
tokenize_chinese_chars: Optional[bool] = True,
|
tokenize_chinese_chars: Optional[bool] = True,
|
||||||
strip_accents: Optional[bool] = None,
|
strip_accents: Optional[bool] = None,
|
||||||
|
clean_up_tokenization_spaces: bool = True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
@@ -360,6 +364,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -91,6 +91,9 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
|
|||||||
strip_accents (`bool`, *optional*):
|
strip_accents (`bool`, *optional*):
|
||||||
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
|
||||||
value for `lowercase` (as in the original SqueezeBERT).
|
value for `lowercase` (as in the original SqueezeBERT).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -108,6 +111,7 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token="[MASK]",
|
mask_token="[MASK]",
|
||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
strip_accents=None,
|
strip_accents=None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not os.path.isfile(vocab_file):
|
if not os.path.isfile(vocab_file):
|
||||||
@@ -139,6 +143,7 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
|
|||||||
mask_token=mask_token,
|
mask_token=mask_token,
|
||||||
tokenize_chinese_chars=tokenize_chinese_chars,
|
tokenize_chinese_chars=tokenize_chinese_chars,
|
||||||
strip_accents=strip_accents,
|
strip_accents=strip_accents,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -225,6 +225,9 @@ class TapasTokenizer(PreTrainedTokenizer):
|
|||||||
Minimum length of each question in terms of tokens (will be skipped otherwise).
|
Minimum length of each question in terms of tokens (will be skipped otherwise).
|
||||||
max_question_length (`int`, *optional*):
|
max_question_length (`int`, *optional*):
|
||||||
Maximum length of each question in terms of tokens (will be skipped otherwise).
|
Maximum length of each question in terms of tokens (will be skipped otherwise).
|
||||||
|
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||||
|
Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
|
||||||
|
extra spaces.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -252,6 +255,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
|||||||
max_question_length=None,
|
max_question_length=None,
|
||||||
model_max_length: int = 512,
|
model_max_length: int = 512,
|
||||||
additional_special_tokens: Optional[List[str]] = None,
|
additional_special_tokens: Optional[List[str]] = None,
|
||||||
|
clean_up_tokenization_spaces=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if not is_pandas_available():
|
if not is_pandas_available():
|
||||||
@@ -322,6 +326,7 @@ class TapasTokenizer(PreTrainedTokenizer):
|
|||||||
max_question_length=max_question_length,
|
max_question_length=max_question_length,
|
||||||
model_max_length=model_max_length,
|
model_max_length=model_max_length,
|
||||||
additional_special_tokens=additional_special_tokens,
|
additional_special_tokens=additional_special_tokens,
|
||||||
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1622,7 +1622,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# By default, cleaning tokenization spaces for both fast and slow tokenizers
|
# By default, cleaning tokenization spaces for both fast and slow tokenizers
|
||||||
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
|
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
|
||||||
|
|
||||||
# By default, do not split special tokens for both fast and slow tokenizers
|
# By default, do not split special tokens for both fast and slow tokenizers
|
||||||
self.split_special_tokens = kwargs.pop("split_special_tokens", False)
|
self.split_special_tokens = kwargs.pop("split_special_tokens", False)
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
|
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "lower newer"
|
input_text = "lower newer"
|
||||||
output_text = "lower newer"
|
output_text = "lower[SPACE]newer"
|
||||||
return input_text, output_text
|
return input_text, output_text
|
||||||
|
|
||||||
# Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens
|
# Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens
|
||||||
|
|||||||
Reference in New Issue
Block a user