fix CLIP fast tokenizer and change some properties of the slow version (#15067)
Very big changes concerning the tokenizer fast of CLIP which did not correspond to the tokenizer slow of CLIP Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -559,6 +559,10 @@ jobs:
|
|||||||
if [ -f test_list.txt ]; then
|
if [ -f test_list.txt ]; then
|
||||||
python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt
|
python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt
|
||||||
fi
|
fi
|
||||||
|
- run: |
|
||||||
|
if [ -f test_list.txt ]; then
|
||||||
|
python -m pytest -n 1 tests/test_tokenization_clip.py --dist=loadfile -s --make-reports=tests_tokenization_clip --durations=100 | tee tests_output.txt
|
||||||
|
fi
|
||||||
- store_artifacts:
|
- store_artifacts:
|
||||||
path: ~/transformers/tests_output.txt
|
path: ~/transformers/tests_output.txt
|
||||||
- store_artifacts:
|
- store_artifacts:
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -105,6 +105,7 @@ _deps = [
|
|||||||
"filelock",
|
"filelock",
|
||||||
"flake8>=3.8.3",
|
"flake8>=3.8.3",
|
||||||
"flax>=0.3.5",
|
"flax>=0.3.5",
|
||||||
|
"ftfy",
|
||||||
"fugashi>=1.0",
|
"fugashi>=1.0",
|
||||||
"GitPython<3.1.19",
|
"GitPython<3.1.19",
|
||||||
"huggingface-hub>=0.1.0,<1.0",
|
"huggingface-hub>=0.1.0,<1.0",
|
||||||
@@ -242,6 +243,7 @@ else:
|
|||||||
extras["flax"] = deps_list("jax", "jaxlib", "flax", "optax")
|
extras["flax"] = deps_list("jax", "jaxlib", "flax", "optax")
|
||||||
|
|
||||||
extras["tokenizers"] = deps_list("tokenizers")
|
extras["tokenizers"] = deps_list("tokenizers")
|
||||||
|
extras["ftfy"] = deps_list("ftfy")
|
||||||
extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools")
|
extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools")
|
||||||
extras["onnx"] = deps_list("onnxconverter-common", "tf2onnx") + extras["onnxruntime"]
|
extras["onnx"] = deps_list("onnxconverter-common", "tf2onnx") + extras["onnxruntime"]
|
||||||
extras["modelcreation"] = deps_list("cookiecutter")
|
extras["modelcreation"] = deps_list("cookiecutter")
|
||||||
|
|||||||
@@ -823,6 +823,7 @@ class CLIPConverter(Converter):
|
|||||||
def converted(self) -> Tokenizer:
|
def converted(self) -> Tokenizer:
|
||||||
vocab = self.original_tokenizer.encoder
|
vocab = self.original_tokenizer.encoder
|
||||||
merges = list(self.original_tokenizer.bpe_ranks.keys())
|
merges = list(self.original_tokenizer.bpe_ranks.keys())
|
||||||
|
unk_token = self.original_tokenizer.unk_token
|
||||||
|
|
||||||
tokenizer = Tokenizer(
|
tokenizer = Tokenizer(
|
||||||
BPE(
|
BPE(
|
||||||
@@ -832,13 +833,32 @@ class CLIPConverter(Converter):
|
|||||||
continuing_subword_prefix="",
|
continuing_subword_prefix="",
|
||||||
end_of_word_suffix="</w>",
|
end_of_word_suffix="</w>",
|
||||||
fuse_unk=False,
|
fuse_unk=False,
|
||||||
|
unk_token=str(unk_token),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=self.original_tokenizer.add_prefix_space)
|
tokenizer.normalizer = normalizers.Sequence(
|
||||||
|
[normalizers.NFC(), normalizers.Replace(Regex(r"\s+"), " "), normalizers.Lowercase()]
|
||||||
|
)
|
||||||
|
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
|
||||||
|
[
|
||||||
|
pre_tokenizers.Split(
|
||||||
|
Regex(r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""),
|
||||||
|
behavior="removed",
|
||||||
|
invert=True,
|
||||||
|
),
|
||||||
|
pre_tokenizers.ByteLevel(add_prefix_space=False),
|
||||||
|
]
|
||||||
|
)
|
||||||
tokenizer.decoder = decoders.ByteLevel()
|
tokenizer.decoder = decoders.ByteLevel()
|
||||||
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
|
|
||||||
|
|
||||||
|
# Hack to have a ByteLevel and TemplaceProcessor
|
||||||
|
tokenizer.post_processor = processors.RobertaProcessing(
|
||||||
|
sep=(self.original_tokenizer.eos_token, self.original_tokenizer.eos_token_id),
|
||||||
|
cls=(self.original_tokenizer.bos_token, self.original_tokenizer.bos_token_id),
|
||||||
|
add_prefix_space=False,
|
||||||
|
trim_offsets=False,
|
||||||
|
)
|
||||||
return tokenizer
|
return tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ deps = {
|
|||||||
"filelock": "filelock",
|
"filelock": "filelock",
|
||||||
"flake8": "flake8>=3.8.3",
|
"flake8": "flake8>=3.8.3",
|
||||||
"flax": "flax>=0.3.5",
|
"flax": "flax>=0.3.5",
|
||||||
|
"ftfy": "ftfy",
|
||||||
"fugashi": "fugashi>=1.0",
|
"fugashi": "fugashi>=1.0",
|
||||||
"GitPython": "GitPython<3.1.19",
|
"GitPython": "GitPython<3.1.19",
|
||||||
"huggingface-hub": "huggingface-hub>=0.1.0,<1.0",
|
"huggingface-hub": "huggingface-hub>=0.1.0,<1.0",
|
||||||
|
|||||||
@@ -158,6 +158,13 @@ except importlib_metadata.PackageNotFoundError:
|
|||||||
except importlib_metadata.PackageNotFoundError:
|
except importlib_metadata.PackageNotFoundError:
|
||||||
_faiss_available = False
|
_faiss_available = False
|
||||||
|
|
||||||
|
_ftfy_available = importlib.util.find_spec("ftfy") is not None
|
||||||
|
try:
|
||||||
|
_ftfy_version = importlib_metadata.version("ftfy")
|
||||||
|
logger.debug(f"Successfully imported ftfy version {_ftfy_version}")
|
||||||
|
except importlib_metadata.PackageNotFoundError:
|
||||||
|
_ftfy_available = False
|
||||||
|
|
||||||
|
|
||||||
coloredlogs = importlib.util.find_spec("coloredlogs") is not None
|
coloredlogs = importlib.util.find_spec("coloredlogs") is not None
|
||||||
try:
|
try:
|
||||||
@@ -441,6 +448,10 @@ def is_flax_available():
|
|||||||
return _flax_available
|
return _flax_available
|
||||||
|
|
||||||
|
|
||||||
|
def is_ftfy_available():
|
||||||
|
return _ftfy_available
|
||||||
|
|
||||||
|
|
||||||
def is_torch_tpu_available():
|
def is_torch_tpu_available():
|
||||||
if not _torch_available:
|
if not _torch_available:
|
||||||
return False
|
return False
|
||||||
@@ -516,10 +527,6 @@ def is_spacy_available():
|
|||||||
return importlib.util.find_spec("spacy") is not None
|
return importlib.util.find_spec("spacy") is not None
|
||||||
|
|
||||||
|
|
||||||
def is_ftfy_available():
|
|
||||||
return importlib.util.find_spec("ftfy") is not None
|
|
||||||
|
|
||||||
|
|
||||||
def is_in_notebook():
|
def is_in_notebook():
|
||||||
try:
|
try:
|
||||||
# Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py
|
# Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py
|
||||||
@@ -722,6 +729,13 @@ FLAX_IMPORT_ERROR = """
|
|||||||
installation page: https://github.com/google/flax and follow the ones that match your environment.
|
installation page: https://github.com/google/flax and follow the ones that match your environment.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# docstyle-ignore
|
||||||
|
FTFY_IMPORT_ERROR = """
|
||||||
|
{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the
|
||||||
|
installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones
|
||||||
|
that match your environment.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
SCATTER_IMPORT_ERROR = """
|
SCATTER_IMPORT_ERROR = """
|
||||||
@@ -801,6 +815,7 @@ BACKENDS_MAPPING = OrderedDict(
|
|||||||
("detectron2", (is_detectron2_available, DETECTRON2_IMPORT_ERROR)),
|
("detectron2", (is_detectron2_available, DETECTRON2_IMPORT_ERROR)),
|
||||||
("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)),
|
("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)),
|
||||||
("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
|
("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
|
||||||
|
("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
|
||||||
("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
|
("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
|
||||||
("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
|
("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
|
||||||
("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)),
|
("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)),
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
|
|
||||||
|
|
||||||
PRETRAINED_INIT_CONFIGURATION = {
|
PRETRAINED_INIT_CONFIGURATION = {
|
||||||
"openai/clip-vit-base-patch32": {"do_lower_case": True},
|
"openai/clip-vit-base-patch32": {},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -101,19 +101,6 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
|||||||
"""
|
"""
|
||||||
Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
|
Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.
|
||||||
|
|
||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
|
||||||
|
|
||||||
|
|
||||||
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
|
|
||||||
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
|
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
|
||||||
this superclass for more information regarding those methods.
|
this superclass for more information regarding those methods.
|
||||||
|
|
||||||
@@ -132,9 +119,6 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
|||||||
The beginning of sequence token.
|
The beginning of sequence token.
|
||||||
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||||
The end of sequence token.
|
The end of sequence token.
|
||||||
add_prefix_space (`bool`, *optional*, defaults to `False`):
|
|
||||||
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
|
|
||||||
other word. (CLIP tokenizer detect beginning of words by the preceding space).
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -151,8 +135,6 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
|||||||
bos_token="<|startoftext|>",
|
bos_token="<|startoftext|>",
|
||||||
eos_token="<|endoftext|>",
|
eos_token="<|endoftext|>",
|
||||||
pad_token="<|endoftext|>", # hack to enable padding
|
pad_token="<|endoftext|>", # hack to enable padding
|
||||||
add_prefix_space=False,
|
|
||||||
do_lower_case=True,
|
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
||||||
@@ -165,8 +147,6 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
|||||||
bos_token=bos_token,
|
bos_token=bos_token,
|
||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
pad_token=pad_token,
|
pad_token=pad_token,
|
||||||
add_prefix_space=add_prefix_space,
|
|
||||||
do_lower_case=do_lower_case,
|
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -190,21 +170,12 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
|||||||
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
|
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
|
||||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||||
self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}
|
self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}
|
||||||
self.add_prefix_space = add_prefix_space
|
|
||||||
|
|
||||||
self.pat = re.compile(
|
self.pat = re.compile(
|
||||||
r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
|
r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Very ugly hack to enable padding
|
|
||||||
@property
|
|
||||||
def pad_token_id(self) -> Optional[int]:
|
|
||||||
"""
|
|
||||||
`Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
|
|
||||||
"""
|
|
||||||
return 0
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocab_size(self):
|
def vocab_size(self):
|
||||||
return len(self.encoder)
|
return len(self.encoder)
|
||||||
@@ -232,9 +203,12 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
|||||||
Returns:
|
Returns:
|
||||||
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
||||||
"""
|
"""
|
||||||
|
bos_token = [self.bos_token_id]
|
||||||
|
eos_token = [self.eos_token_id]
|
||||||
|
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
|
return bos_token + token_ids_0 + eos_token
|
||||||
return [self.bos_token_id] + token_ids_0 + token_ids_1 + [self.eos_token_id]
|
return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token
|
||||||
|
|
||||||
def get_special_tokens_mask(
|
def get_special_tokens_mask(
|
||||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||||
@@ -262,7 +236,30 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
return [1] + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1]
|
||||||
|
|
||||||
|
def create_token_type_ids_from_sequences(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
|
"""
|
||||||
|
Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
|
||||||
|
zeros is returned.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (`List[int]`):
|
||||||
|
List of IDs.
|
||||||
|
token_ids_1 (`List[int]`, *optional*):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`List[int]`: List of zeros.
|
||||||
|
"""
|
||||||
|
bos_token = [self.bos_token_id]
|
||||||
|
eos_token = [self.eos_token_id]
|
||||||
|
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return len(bos_token + token_ids_0 + eos_token) * [0]
|
||||||
|
return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]
|
||||||
|
|
||||||
def bpe(self, token):
|
def bpe(self, token):
|
||||||
if token in self.cache:
|
if token in self.cache:
|
||||||
@@ -332,7 +329,8 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
|||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
"""Converts a sequence of tokens (string) in a single string."""
|
"""Converts a sequence of tokens (string) in a single string."""
|
||||||
text = "".join(tokens)
|
text = "".join(tokens)
|
||||||
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors).replace("</w>", " ")
|
byte_array = bytearray([self.byte_decoder[c] for c in text])
|
||||||
|
text = byte_array.decode("utf-8", errors=self.errors).replace("</w>", " ").strip()
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||||
@@ -363,9 +361,3 @@ class CLIPTokenizer(PreTrainedTokenizer):
|
|||||||
index += 1
|
index += 1
|
||||||
|
|
||||||
return vocab_file, merge_file
|
return vocab_file, merge_file
|
||||||
|
|
||||||
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
|
|
||||||
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
|
|
||||||
if is_split_into_words or add_prefix_space:
|
|
||||||
text = " " + text
|
|
||||||
return (text, kwargs)
|
|
||||||
|
|||||||
@@ -15,12 +15,10 @@
|
|||||||
"""Tokenization classes for OpenAI GPT."""
|
"""Tokenization classes for OpenAI GPT."""
|
||||||
|
|
||||||
|
|
||||||
import json
|
from typing import List, Optional, Tuple
|
||||||
from typing import Optional, Tuple
|
|
||||||
|
|
||||||
from tokenizers import pre_tokenizers
|
from tokenizers import pre_tokenizers
|
||||||
|
|
||||||
from ...tokenization_utils_base import BatchEncoding
|
|
||||||
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from .tokenization_clip import CLIPTokenizer
|
from .tokenization_clip import CLIPTokenizer
|
||||||
@@ -52,27 +50,6 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
|
Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
|
||||||
Byte-Pair-Encoding.
|
Byte-Pair-Encoding.
|
||||||
|
|
||||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
|
||||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
|
||||||
|
|
||||||
```
|
|
||||||
>>> from transformers import CLIPTokenizerFast
|
|
||||||
>>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
|
|
||||||
>>> tokenizer("Hello world")['input_ids']
|
|
||||||
[15496, 995]
|
|
||||||
>>> tokenizer(" Hello world")['input_ids']
|
|
||||||
[18435, 995]
|
|
||||||
```
|
|
||||||
|
|
||||||
You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
|
|
||||||
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
|
|
||||||
|
|
||||||
<Tip>
|
|
||||||
|
|
||||||
When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
|
|
||||||
|
|
||||||
</Tip>
|
|
||||||
|
|
||||||
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
|
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
|
||||||
refer to this superclass for more information regarding those methods.
|
refer to this superclass for more information regarding those methods.
|
||||||
|
|
||||||
@@ -81,9 +58,6 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
Path to the vocabulary file.
|
Path to the vocabulary file.
|
||||||
merges_file (`str`):
|
merges_file (`str`):
|
||||||
Path to the merges file.
|
Path to the merges file.
|
||||||
errors (`str`, *optional*, defaults to `"replace"`):
|
|
||||||
Paradigm to follow when decoding bytes to UTF-8. See
|
|
||||||
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
|
|
||||||
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
token instead.
|
token instead.
|
||||||
@@ -91,11 +65,6 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
The beginning of sequence token.
|
The beginning of sequence token.
|
||||||
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||||
The end of sequence token.
|
The end of sequence token.
|
||||||
add_prefix_space (`bool`, *optional*, defaults to `False`):
|
|
||||||
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
|
|
||||||
other word. (CLIP tokenizer detect beginning of words by the preceding space).
|
|
||||||
trim_offsets (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -113,7 +82,6 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
bos_token="<|startoftext|>",
|
bos_token="<|startoftext|>",
|
||||||
eos_token="<|endoftext|>",
|
eos_token="<|endoftext|>",
|
||||||
pad_token="<|endoftext|>", # hack to enable padding
|
pad_token="<|endoftext|>", # hack to enable padding
|
||||||
add_prefix_space=False,
|
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
super().__init__(
|
super().__init__(
|
||||||
@@ -124,44 +92,81 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
bos_token=bos_token,
|
bos_token=bos_token,
|
||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
pad_token=pad_token,
|
pad_token=pad_token,
|
||||||
add_prefix_space=add_prefix_space,
|
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
|
if not isinstance(self.backend_tokenizer.pre_tokenizer, pre_tokenizers.Sequence):
|
||||||
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
|
raise ValueError(
|
||||||
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
|
"The `backend_tokenizer` provided does not match the expected format. The CLIP tokenizer has been "
|
||||||
pre_tok_state["add_prefix_space"] = add_prefix_space
|
"heavily modified from transformers version 4.17.0. You need to convert the tokenizer you are using to be compatible with this version."
|
||||||
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
|
"The easiest way to do so is "
|
||||||
|
'`CLIPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo, from_slow=True)`.'
|
||||||
|
" If you want to use your existing tokenizer, you will have to revert to a version prior to "
|
||||||
|
"4.17.0 of transformers."
|
||||||
|
)
|
||||||
|
|
||||||
self.add_prefix_space = add_prefix_space
|
self._wrap_decode_method_backend_tokenizer()
|
||||||
|
|
||||||
# Very ugly hack to enable padding
|
# Very ugly hack to enable padding to have a correct decoding see https://github.com/huggingface/tokenizers/issues/872
|
||||||
@property
|
def _wrap_decode_method_backend_tokenizer(self):
|
||||||
def pad_token_id(self) -> Optional[int]:
|
orig_decode_method = self.backend_tokenizer.decode
|
||||||
|
|
||||||
|
def new_decode_method(*args, **kwargs):
|
||||||
|
text = orig_decode_method(*args, **kwargs)
|
||||||
|
text = text.replace(self.backend_tokenizer.model.end_of_word_suffix, " ").strip()
|
||||||
|
return text
|
||||||
|
|
||||||
|
self.backend_tokenizer.decode = new_decode_method
|
||||||
|
|
||||||
|
def build_inputs_with_special_tokens(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
`Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
|
||||||
|
adding special tokens. A CLIP sequence has the following format:
|
||||||
|
|
||||||
|
- single sequence: `<|startoftext|> X <|endoftext|>`
|
||||||
|
|
||||||
|
Pairs of sequences are not the expected use case, but they will be handled without a separator.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (`List[int]`):
|
||||||
|
List of IDs to which the special tokens will be added.
|
||||||
|
token_ids_1 (`List[int]`, *optional*):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
||||||
"""
|
"""
|
||||||
return 0
|
bos_token = [self.bos_token_id]
|
||||||
|
eos_token = [self.eos_token_id]
|
||||||
|
|
||||||
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
|
if token_ids_1 is None:
|
||||||
is_split_into_words = kwargs.get("is_split_into_words", False)
|
return bos_token + token_ids_0 + eos_token
|
||||||
assert self.add_prefix_space or not is_split_into_words, (
|
return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token
|
||||||
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
|
|
||||||
"to use it with pretokenized inputs."
|
|
||||||
)
|
|
||||||
|
|
||||||
return super()._batch_encode_plus(*args, **kwargs)
|
def create_token_type_ids_from_sequences(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
|
"""
|
||||||
|
Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
|
||||||
|
zeros is returned.
|
||||||
|
|
||||||
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
|
Args:
|
||||||
is_split_into_words = kwargs.get("is_split_into_words", False)
|
token_ids_0 (`List[int]`):
|
||||||
|
List of IDs.
|
||||||
|
token_ids_1 (`List[int]`, *optional*):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
assert self.add_prefix_space or not is_split_into_words, (
|
Returns:
|
||||||
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
|
`List[int]`: List of zeros.
|
||||||
"to use it with pretokenized inputs."
|
"""
|
||||||
)
|
bos_token = [self.bos_token_id]
|
||||||
|
eos_token = [self.eos_token_id]
|
||||||
|
|
||||||
return super()._encode_plus(*args, **kwargs)
|
if token_ids_1 is None:
|
||||||
|
return len(bos_token + token_ids_0 + eos_token) * [0]
|
||||||
|
return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||||
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
|
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ import unittest
|
|||||||
|
|
||||||
from transformers import CLIPTokenizer, CLIPTokenizerFast
|
from transformers import CLIPTokenizer, CLIPTokenizerFast
|
||||||
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import require_tokenizers
|
from transformers.testing_utils import require_ftfy, require_tokenizers
|
||||||
|
|
||||||
from .test_tokenization_common import TokenizerTesterMixin
|
from .test_tokenization_common import TokenizerTesterMixin
|
||||||
|
|
||||||
@@ -30,18 +30,20 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
tokenizer_class = CLIPTokenizer
|
tokenizer_class = CLIPTokenizer
|
||||||
rust_tokenizer_class = CLIPTokenizerFast
|
rust_tokenizer_class = CLIPTokenizerFast
|
||||||
test_rust_tokenizer = False
|
test_rust_tokenizer = True
|
||||||
from_pretrained_kwargs = {"add_prefix_space": True}
|
from_pretrained_kwargs = {}
|
||||||
test_seq2seq = False
|
test_seq2seq = False
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super().setUp()
|
super().setUp()
|
||||||
|
# temporary addition: to test the new slow to fast converter
|
||||||
|
self.tokenizers_list = [(CLIPTokenizerFast, "SaulLu/clip-vit-base-patch32", {})]
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|endoftext|>"]
|
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>", ""]
|
merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"]
|
||||||
self.special_tokens_map = {"unk_token": "<unk>"}
|
self.special_tokens_map = {"unk_token": "<unk>"}
|
||||||
|
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
@@ -61,148 +63,126 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "lower newer"
|
input_text = "lower newer"
|
||||||
output_text = "lower newer "
|
output_text = "lower newer"
|
||||||
return input_text, output_text
|
return input_text, output_text
|
||||||
|
|
||||||
def test_full_tokenizer(self):
|
def test_full_tokenizer(self):
|
||||||
tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
||||||
text = "lower newer"
|
text = "lower newer"
|
||||||
bpe_tokens = ["lo", "w", "er</w>", "n", "e", "w", "er</w>"]
|
bpe_tokens = ["lo", "w", "er</w>", "n", "e", "w", "er</w>"]
|
||||||
tokens = tokenizer.tokenize(text, add_prefix_space=True)
|
tokens = tokenizer.tokenize(text)
|
||||||
self.assertListEqual(tokens, bpe_tokens)
|
self.assertListEqual(tokens, bpe_tokens)
|
||||||
|
|
||||||
input_tokens = tokens + [tokenizer.unk_token]
|
input_tokens = tokens + [tokenizer.unk_token]
|
||||||
input_bpe_tokens = [10, 2, 12, 9, 3, 2, 12, 16]
|
input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20]
|
||||||
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||||
|
|
||||||
def test_rust_and_python_full_tokenizers(self):
|
@require_ftfy
|
||||||
if not self.test_rust_tokenizer:
|
def test_check_encoding_slow_fast(self):
|
||||||
return
|
|
||||||
|
|
||||||
tokenizer = self.get_tokenizer()
|
|
||||||
rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
|
|
||||||
|
|
||||||
sequence = "lower newer"
|
|
||||||
|
|
||||||
# Testing tokenization
|
|
||||||
tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
|
|
||||||
rust_tokens = rust_tokenizer.tokenize(sequence)
|
|
||||||
self.assertListEqual(tokens, rust_tokens)
|
|
||||||
|
|
||||||
# Testing conversion to ids without special tokens
|
|
||||||
ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
|
|
||||||
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
|
|
||||||
self.assertListEqual(ids, rust_ids)
|
|
||||||
|
|
||||||
# Testing conversion to ids with special tokens
|
|
||||||
rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
|
|
||||||
ids = tokenizer.encode(sequence, add_prefix_space=True)
|
|
||||||
rust_ids = rust_tokenizer.encode(sequence)
|
|
||||||
self.assertListEqual(ids, rust_ids)
|
|
||||||
|
|
||||||
# Testing the unknown token
|
|
||||||
input_tokens = tokens + [rust_tokenizer.unk_token]
|
|
||||||
input_bpe_tokens = [10, 2, 12, 9, 3, 2, 12, 16]
|
|
||||||
self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
|
||||||
|
|
||||||
def test_pretokenized_inputs(self, *args, **kwargs):
|
|
||||||
# It's very difficult to mix/test pretokenization with byte-level
|
|
||||||
# And get both CLIP and Roberta to work at the same time (mostly an issue of adding a space before the string)
|
|
||||||
pass
|
|
||||||
|
|
||||||
def test_padding(self, max_length=15):
|
|
||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
|
tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
# Simple input
|
text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"
|
||||||
s = "This is a simple input"
|
text_tokenized_s = tokenizer_s.tokenize(text)
|
||||||
s2 = ["This is a simple input 1", "This is a simple input 2"]
|
text_tokenized_r = tokenizer_r.tokenize(text)
|
||||||
p = ("This is a simple input", "This is a pair")
|
|
||||||
p2 = [
|
self.assertListEqual(text_tokenized_s, text_tokenized_r)
|
||||||
("This is a simple input 1", "This is a simple input 2"),
|
|
||||||
("This is a simple pair 1", "This is a simple pair 2"),
|
# Test that the tokenization is identical on an example containing a character (Latin Small Letter A
|
||||||
|
# with Tilde) encoded in 2 different ways
|
||||||
|
text = "xa\u0303y" + " " + "x\xe3y"
|
||||||
|
text_tokenized_s = tokenizer_s.tokenize(text)
|
||||||
|
text_tokenized_r = tokenizer_r.tokenize(text)
|
||||||
|
|
||||||
|
self.assertListEqual(text_tokenized_s, text_tokenized_r)
|
||||||
|
|
||||||
|
# Test that the tokenization is identical on unicode of space type
|
||||||
|
spaces_unicodes = [
|
||||||
|
"\u0009", # (horizontal tab, '\t')
|
||||||
|
"\u000B", # (vertical tab)
|
||||||
|
"\u000C", # (form feed)
|
||||||
|
"\u0020", # (space, ' ')
|
||||||
|
"\u200E", # (left-to-right mark):w
|
||||||
|
"\u200F", # (right-to-left mark)
|
||||||
|
]
|
||||||
|
for unicode_seq in spaces_unicodes:
|
||||||
|
text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
|
||||||
|
text_tokenized_r = tokenizer_r.tokenize(unicode_seq)
|
||||||
|
|
||||||
|
self.assertListEqual(text_tokenized_s, text_tokenized_r)
|
||||||
|
|
||||||
|
# Test that the tokenization is identical on unicode of line break type
|
||||||
|
line_break_unicodes = [
|
||||||
|
"\u000A", # (line feed, '\n')
|
||||||
|
"\r\n", # (carriage return and line feed, '\r\n')
|
||||||
|
"\u000D", # (carriage return, '\r')
|
||||||
|
"\r", # (carriage return, '\r')
|
||||||
|
"\u000D", # (carriage return, '\r')
|
||||||
|
"\u2028", # (line separator)
|
||||||
|
"\u2029", # (paragraph separator)
|
||||||
|
# "\u0085", # (next line)
|
||||||
]
|
]
|
||||||
|
|
||||||
# Simple input tests
|
# The tokenization is not identical for the character "\u0085" (next line). The slow version transforms
|
||||||
self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
|
# it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
|
||||||
|
# space (and thus into an empty list).
|
||||||
|
|
||||||
# Simple input
|
for unicode_seq in line_break_unicodes:
|
||||||
self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
|
text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
|
||||||
|
text_tokenized_r = tokenizer_r.tokenize(unicode_seq)
|
||||||
|
|
||||||
# Simple input
|
self.assertListEqual(text_tokenized_s, text_tokenized_r)
|
||||||
self.assertRaises(
|
|
||||||
ValueError,
|
def test_offsets_mapping_with_different_add_prefix_space_argument(self):
|
||||||
tokenizer_r.batch_encode_plus,
|
# Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space`
|
||||||
s2,
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
max_length=max_length,
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
padding="max_length",
|
text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name`
|
||||||
|
text = f"{text_of_1_token} {text_of_1_token}"
|
||||||
|
|
||||||
|
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
|
||||||
|
pretrained_name,
|
||||||
|
use_fast=True,
|
||||||
|
)
|
||||||
|
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
|
||||||
|
self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
|
||||||
|
self.assertEqual(
|
||||||
|
encoding.offset_mapping[1],
|
||||||
|
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Pair input
|
text = f" {text}"
|
||||||
self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
|
|
||||||
|
|
||||||
# Pair input
|
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
|
||||||
self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
|
pretrained_name,
|
||||||
|
use_fast=True,
|
||||||
# Pair input
|
)
|
||||||
self.assertRaises(
|
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
|
||||||
ValueError,
|
self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
|
||||||
tokenizer_r.batch_encode_plus,
|
self.assertEqual(
|
||||||
p2,
|
encoding.offset_mapping[1],
|
||||||
max_length=max_length,
|
(1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
|
||||||
padding="max_length",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_add_tokens_tokenizer(self):
|
def test_log_warning(self):
|
||||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
# Test related to the breaking change introduced in transformers v4.17.0
|
||||||
for tokenizer in tokenizers:
|
# We need to check that an error in raised when the user try to load a previous version of the tokenizer.
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
with self.assertRaises(ValueError) as context:
|
||||||
vocab_size = tokenizer.vocab_size
|
self.rust_tokenizer_class.from_pretrained("robot-test/old-clip-tokenizer")
|
||||||
all_size = len(tokenizer)
|
|
||||||
|
|
||||||
self.assertNotEqual(vocab_size, 0)
|
self.assertTrue(
|
||||||
|
context.exception.args[0].startswith(
|
||||||
|
"The `backend_tokenizer` provided does not match the expected format."
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# We usually have added tokens from the start in tests because our vocab fixtures are
|
@require_ftfy
|
||||||
# smaller than the original vocabs - let's not assert this
|
def test_tokenization_python_rust_equals(self):
|
||||||
# self.assertEqual(vocab_size, all_size)
|
super().test_tokenization_python_rust_equals()
|
||||||
|
|
||||||
new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
|
# overwrite common test
|
||||||
added_toks = tokenizer.add_tokens(new_toks)
|
def test_added_tokens_do_lower_case(self):
|
||||||
vocab_size_2 = tokenizer.vocab_size
|
# CLIP always lower cases letters
|
||||||
all_size_2 = len(tokenizer)
|
pass
|
||||||
|
|
||||||
self.assertNotEqual(vocab_size_2, 0)
|
|
||||||
self.assertEqual(vocab_size, vocab_size_2)
|
|
||||||
self.assertEqual(added_toks, len(new_toks))
|
|
||||||
self.assertEqual(all_size_2, all_size + len(new_toks))
|
|
||||||
|
|
||||||
tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
|
|
||||||
|
|
||||||
self.assertGreaterEqual(len(tokens), 4)
|
|
||||||
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
|
||||||
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
|
||||||
|
|
||||||
new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
|
|
||||||
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
|
|
||||||
vocab_size_3 = tokenizer.vocab_size
|
|
||||||
all_size_3 = len(tokenizer)
|
|
||||||
|
|
||||||
self.assertNotEqual(vocab_size_3, 0)
|
|
||||||
self.assertEqual(vocab_size, vocab_size_3)
|
|
||||||
self.assertEqual(added_toks_2, len(new_toks_2))
|
|
||||||
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
|
|
||||||
|
|
||||||
tokens = tokenizer.encode(
|
|
||||||
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertGreaterEqual(len(tokens), 6)
|
|
||||||
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
|
||||||
self.assertGreater(tokens[0], tokens[1])
|
|
||||||
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
|
||||||
self.assertGreater(tokens[-2], tokens[-3])
|
|
||||||
self.assertEqual(tokens[0], tokenizer.eos_token_id)
|
|
||||||
# padding is very hacky in CLIPTokenizer, pad_token_id is always 0
|
|
||||||
# so skip this check
|
|
||||||
# self.assertEqual(tokens[-2], tokenizer.pad_token_id)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user