Authorize last version of tokenizer (#9799)
* Authorize last version of tokenizer
* Update version table
* Fix conversion of spm tokenizers and fix some hub links
* Bump tokenizers version to 0.10.1rc1
* Add script to check tokenizers conversion with XNLI
* Add some more mask_token lstrip support
* Must modify mask_token in slow tokenizers too
* Keep using the old method for Pegasus
* add missing import

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
@@ -22,7 +22,7 @@ from typing import List, Optional, Tuple
|
||||
|
||||
import sentencepiece as spm
|
||||
|
||||
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
@@ -127,6 +127,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
||||
mask_token="[MASK]",
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
remove_space=remove_space,
|
||||
|
||||
@@ -20,6 +20,7 @@ from shutil import copyfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from ...file_utils import is_sentencepiece_available
|
||||
from ...tokenization_utils import AddedToken
|
||||
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||
from ...utils import logging
|
||||
|
||||
@@ -134,6 +135,9 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
|
||||
mask_token="[MASK]",
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
tokenizer_file=tokenizer_file,
|
||||
|
||||
@@ -21,7 +21,7 @@ from typing import List, Optional, Tuple
|
||||
|
||||
import sentencepiece as spm
|
||||
|
||||
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
@@ -112,6 +112,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
|
||||
mask_token="<mask>",
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
|
||||
@@ -20,6 +20,7 @@ from shutil import copyfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from ...file_utils import is_sentencepiece_available
|
||||
from ...tokenization_utils import AddedToken
|
||||
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||
from ...utils import logging
|
||||
|
||||
@@ -119,6 +120,9 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
|
||||
mask_token="<mask>",
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
tokenizer_file=tokenizer_file,
|
||||
|
||||
@@ -21,7 +21,7 @@ from typing import List, Optional, Tuple
|
||||
|
||||
import sentencepiece as spm
|
||||
|
||||
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
@@ -116,6 +116,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
||||
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
|
||||
@@ -20,6 +20,7 @@ from shutil import copyfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from ...file_utils import is_sentencepiece_available
|
||||
from ...tokenization_utils import AddedToken
|
||||
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||
from ...utils import logging
|
||||
|
||||
@@ -123,6 +124,9 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
|
||||
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
tokenizer_file=tokenizer_file,
|
||||
|
||||
@@ -27,7 +27,7 @@ SPIECE_UNDERLINE = "▁"
|
||||
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
-"vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"}
+"vocab_file": {"google/pegasus-xsum": "https://huggingface.co/google/pegasus-xsum/resolve/main/spiece.model"}
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
|
||||
@@ -38,8 +38,10 @@ SPIECE_UNDERLINE = "▁"
|
||||
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
-"vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"},
-"tokenizer_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/tokenizer.json"},
+"vocab_file": {"google/pegasus-xsum": "https://huggingface.co/google/pegasus-xsum/resolve/main/spiece.model"},
+"tokenizer_file": {
+"google/pegasus-xsum": "https://huggingface.co/google/pegasus-xsum/resolve/main/tokenizer.json"
+},
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
|
||||
@@ -42,7 +42,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
|
||||
####################################################
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
"vocab_file": {
|
||||
-"google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/spiece.model"
+"google/reformer-crime-and-punishment": "https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/spiece.model"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -47,10 +47,10 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.
|
||||
####################################################
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
"vocab_file": {
|
||||
-"google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/spiece.model"
+"google/reformer-crime-and-punishment": "https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/spiece.model"
|
||||
},
|
||||
"tokenizer_file": {
|
||||
-"google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/tokenizer.json"
+"google/reformer-crime-and-punishment": "https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/tokenizer.json"
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ from typing import List, Optional, Tuple
|
||||
|
||||
import sentencepiece as spm
|
||||
|
||||
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
@@ -117,6 +117,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
||||
mask_token="<mask>",
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
|
||||
@@ -20,6 +20,7 @@ from shutil import copyfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from ...file_utils import is_sentencepiece_available
|
||||
from ...tokenization_utils import AddedToken
|
||||
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||
from ...utils import logging
|
||||
|
||||
@@ -127,6 +128,9 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
|
||||
mask_token="<mask>",
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
tokenizer_file=tokenizer_file,
|
||||
|
||||
@@ -23,7 +23,7 @@ from typing import List, Optional, Tuple
|
||||
import sentencepiece as spm
|
||||
|
||||
from ...file_utils import SPIECE_UNDERLINE
|
||||
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
@@ -126,6 +126,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
||||
additional_special_tokens=["<eop>", "<eod>"],
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
do_lower_case=do_lower_case,
|
||||
remove_space=remove_space,
|
||||
|
||||
@@ -20,6 +20,7 @@ from shutil import copyfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from ...file_utils import is_sentencepiece_available
|
||||
from ...tokenization_utils import AddedToken
|
||||
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||
from ...utils import logging
|
||||
|
||||
@@ -138,6 +139,9 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast):
|
||||
additional_special_tokens=["<eop>", "<eod>"],
|
||||
**kwargs
|
||||
):
|
||||
# The mask token behaves like a normal word, i.e. it includes the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
tokenizer_file=tokenizer_file,
|
||||
|
||||
Reference in New Issue
Block a user