[Pegasus] Refactor Tokenizer (#8731)
* refactor * further refactor * fix the rest tomorrow * save intermediate * finish slow tokenizer * make more tests pass * finish refactor * fix comment * clean further * fix name * fix naming * Update src/transformers/models/reformer/tokenization_reformer.py * Apply suggestions from code review * Apply suggestions from code review * refactor * fix init tokenizers * refactor * improve convert * refactor * correct convert slow tokenizer * final fix for Pegasus Tok * remove ipdb * improve links
This commit is contained in:
committed by
GitHub
parent
36b60ce9e8
commit
5ced23dc84
@@ -547,10 +547,12 @@ class BertGenerationConverter(SpmConverter):
|
||||
class PegasusConverter(SpmConverter):
|
||||
def vocab(self, proto):
|
||||
vocab = [
|
||||
(self.original_tokenizer.pad_token, 0),
|
||||
(self.original_tokenizer.eos_token, 0),
|
||||
(self.original_tokenizer.pad_token, 0.0),
|
||||
(self.original_tokenizer.eos_token, 0.0),
|
||||
(self.original_tokenizer.mask_token_sent, 0.0),
|
||||
(self.original_tokenizer.mask_token, 0.0),
|
||||
]
|
||||
vocab += [(f"unk_{i}", -100) for i in range(2, 2 + self.original_tokenizer.offset)]
|
||||
vocab += [(f"<unk_{i}>", -100.0) for i in range(2, self.original_tokenizer.offset)]
|
||||
vocab += [(piece.piece, piece.score) for piece in proto.pieces[2:]]
|
||||
return vocab
|
||||
|
||||
@@ -559,13 +561,10 @@ class PegasusConverter(SpmConverter):
|
||||
|
||||
def post_processor(self):
|
||||
eos = self.original_tokenizer.eos_token
|
||||
return processors.TemplateProcessing(
|
||||
single=["$A", eos],
|
||||
pair=["$A", "$B", eos],
|
||||
special_tokens = [
|
||||
(eos, self.original_tokenizer.eos_token_id),
|
||||
],
|
||||
)
|
||||
]
|
||||
return processors.TemplateProcessing(single=["$A", eos], pair=["$A", "$B", eos], special_tokens=special_tokens)
|
||||
|
||||
|
||||
class T5Converter(SpmConverter):
|
||||
|
||||
@@ -71,10 +71,10 @@ SPIECE_UNDERLINE = "▁"
|
||||
|
||||
class AlbertTokenizerFast(PreTrainedTokenizerFast):
|
||||
"""
|
||||
Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
|
||||
<https://github.com/google/sentencepiece>`__. This tokenizer inherits from
|
||||
:class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this
|
||||
superclass for more information regarding those methods
|
||||
Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
|
||||
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
|
||||
inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
|
||||
refer to this superclass for more information regarding those methods.
|
||||
|
||||
Args:
|
||||
vocab_file (:obj:`str`):
|
||||
|
||||
@@ -60,8 +60,8 @@ SPIECE_UNDERLINE = "▁"
|
||||
class CamembertTokenizerFast(PreTrainedTokenizerFast):
|
||||
"""
|
||||
Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
|
||||
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece
|
||||
<https://github.com/google/sentencepiece>`__.
|
||||
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
|
||||
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||
methods. Users should refer to this superclass for more information regarding those methods.
|
||||
|
||||
@@ -67,7 +67,8 @@ FAIRSEQ_LANGUAGE_CODES = [
|
||||
|
||||
class MBartTokenizerFast(XLMRobertaTokenizerFast):
|
||||
"""
|
||||
Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). Based on `BPE
|
||||
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
|
||||
|
||||
:class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds
|
||||
a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`.
|
||||
|
||||
@@ -12,11 +12,16 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Dict, List, Optional
|
||||
import os
|
||||
from shutil import copyfile
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import sentencepiece as spm
|
||||
|
||||
from ...file_utils import add_start_docstrings
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
|
||||
from ..reformer.tokenization_reformer import ReformerTokenizer
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
SPIECE_UNDERLINE = "▁"
|
||||
@@ -32,31 +37,145 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
}
|
||||
|
||||
|
||||
class PegasusTokenizer(ReformerTokenizer):
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class PegasusTokenizer(PreTrainedTokenizer):
|
||||
r"""
|
||||
Construct a Pegasus tokenizer.
|
||||
Construct a PEGASUS tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
|
||||
|
||||
:class:`~transformers.PegasusTokenizer` is identical to :class:`~transformers.ReformerTokenizer` and adds a new
|
||||
:meth:`~transformers.PegasusTokenizer.prepare_seq2seq_batch`
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
|
||||
Users should refer to this superclass for more information regarding those methods.
|
||||
|
||||
Refer to superclass :class:`~transformers.ReformerTokenizer` for usage examples and documentation concerning the
|
||||
initialization parameters and other methods.
|
||||
Args:
|
||||
vocab_file (:obj:`str`):
|
||||
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
|
||||
contains the vocabulary necessary to instantiate a tokenizer.
|
||||
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
|
||||
The token used for padding, for example when batching sequences of different lengths.
|
||||
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
|
||||
The end of sequence token.
|
||||
|
||||
.. note::
|
||||
|
||||
When building a sequence using special tokens, this is not the token that is used for the end of
|
||||
sequence. The token used is the :obj:`sep_token`.
|
||||
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask_2>"`):
|
||||
The token used for masking single token values. This is the token used when training this model with masked
|
||||
language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
|
||||
It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
|
||||
Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
|
||||
mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`"<mask_1>"`):
|
||||
The token used for masking whole target sentences. This is the token used when training this model with gap
|
||||
sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
|
||||
pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for
|
||||
Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
|
||||
additional_special_tokens (:obj:`List[str]`, `optional`):
|
||||
Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_1> and
|
||||
<unk_2>, ..., <unk_102> are used as additional special tokens corresponding to the `original PEGASUS
|
||||
tokenizer
|
||||
<https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66>`__
|
||||
that uses the tokens 2 - 104 only for pretraining
|
||||
"""
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
offset = 103 # entries 2 - 104 are only used for pretraining
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
model_input_names = ["attention_mask"]
|
||||
|
||||
def __init__(self, *args, pad_token="<pad>", **kwargs):
|
||||
super().__init__(*args, **kwargs, pad_token="<pad>")
|
||||
# Don't use reserved words added_token_encoder, added_tokens_decoder because of
|
||||
# AssertionError: Non-consecutive added token '1' found. in from_pretrained
|
||||
assert len(self.added_tokens_decoder) == 0
|
||||
self.encoder: Dict[int, str] = {0: self.pad_token, 1: self.eos_token}
|
||||
# entries 2-104 are only used for pretraining and called unk_2, ...unk_104
|
||||
self.encoder.update({i: f"unk_{i}" for i in range(2, self.offset + 2)})
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
pad_token="<pad>",
|
||||
eos_token="</s>",
|
||||
unk_token="<unk>",
|
||||
mask_token="<mask_2>",
|
||||
mask_token_sent="<mask_1>",
|
||||
additional_special_tokens=None,
|
||||
**kwargs
|
||||
):
|
||||
if additional_special_tokens is not None:
|
||||
assert isinstance(
|
||||
additional_special_tokens, list
|
||||
), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}"
|
||||
|
||||
additional_special_tokens_extended = (
|
||||
([mask_token_sent] + additional_special_tokens)
|
||||
if mask_token_sent not in additional_special_tokens
|
||||
else additional_special_tokens
|
||||
)
|
||||
# fill the remaining slots with <unk_{i}> placeholder tokens in case not all additional tokens are already taken
|
||||
additional_special_tokens_extended += [
|
||||
f"<unk_{i}>" for i in range(len(additional_special_tokens_extended), self.offset - 1)
|
||||
]
|
||||
|
||||
if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
|
||||
raise ValueError(
|
||||
f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of <unk_x> tokens. Found {additional_special_tokens_extended}."
|
||||
)
|
||||
additional_special_tokens = additional_special_tokens_extended
|
||||
else:
|
||||
additional_special_tokens = [mask_token_sent]
|
||||
additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
|
||||
|
||||
super().__init__(
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
mask_token=mask_token,
|
||||
pad_token=pad_token,
|
||||
mask_token_sent=mask_token_sent,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
**kwargs,
|
||||
)
|
||||
self.vocab_file = vocab_file
|
||||
self.sp_model = spm.SentencePieceProcessor()
|
||||
self.sp_model.Load(vocab_file)
|
||||
self.mask_token_sent = mask_token_sent
|
||||
|
||||
# add special tokens to encoder dict
|
||||
self.encoder: Dict[int, str] = {
|
||||
0: self.pad_token,
|
||||
1: self.eos_token,
|
||||
2: self.mask_token_sent,
|
||||
3: self.mask_token,
|
||||
}
|
||||
# entries 2-104 are only used for pretraining and called <mask_1>, <mask_2>, <unk_2>, ..., <unk_102>
|
||||
# mask_token_sent is already added to list -> so start at 1
|
||||
self.encoder.update({i + 3: additional_special_tokens[i] for i in range(1, self.offset - 1)})
|
||||
self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
return len(self.sp_model) + self.offset
|
||||
|
||||
def get_vocab(self) -> Dict[str, int]:
|
||||
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
return vocab
|
||||
|
||||
def __getstate__(self):
|
||||
state = self.__dict__.copy()
|
||||
state["sp_model"] = None
|
||||
return state
|
||||
|
||||
def __setstate__(self, d):
|
||||
self.__dict__ = d
|
||||
self.sp_model = spm.SentencePieceProcessor()
|
||||
self.sp_model.Load(self.vocab_file)
|
||||
|
||||
def _tokenize(self, text, sample=False):
|
||||
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
|
||||
if not sample:
|
||||
pieces = self.sp_model.EncodeAsPieces(text)
|
||||
else:
|
||||
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
|
||||
return pieces
|
||||
|
||||
def _convert_token_to_id(self, token: str) -> int:
|
||||
""" Converts a token (str) to an id using the vocab. """
|
||||
if token in self.decoder:
|
||||
@@ -73,13 +192,13 @@ class PegasusTokenizer(ReformerTokenizer):
|
||||
elif index in self.added_tokens_encoder:
|
||||
return self.added_tokens_encoder[index]
|
||||
else:
|
||||
# assert index > self.offset, f"cannot decode ids between 2 and {self.offset}. Got {index}"
|
||||
token = self.sp_model.IdToPiece(index - self.offset)
|
||||
return token
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
return len(self.sp_model) + self.offset
|
||||
def convert_tokens_to_string(self, tokens):
|
||||
""" Converts a sequence of tokens (string) in a single string. """
|
||||
out_string = self.sp_model.decode_pieces(tokens)
|
||||
return out_string
|
||||
|
||||
def num_special_tokens_to_add(self, pair=False):
|
||||
"""Just EOS"""
|
||||
@@ -88,7 +207,11 @@ class PegasusTokenizer(ReformerTokenizer):
|
||||
def _special_token_mask(self, seq):
|
||||
all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp
|
||||
all_special_ids.remove(self.unk_token_id) # <unk> is only sometimes special
|
||||
assert all_special_ids == set([0, 1])
|
||||
|
||||
assert all_special_ids == set(
|
||||
range(len(self.additional_special_tokens) + 3)
|
||||
), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
|
||||
|
||||
return [1 if x in all_special_ids else 0 for x in seq]
|
||||
|
||||
def get_special_tokens_mask(
|
||||
@@ -105,7 +228,7 @@ class PegasusTokenizer(ReformerTokenizer):
|
||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
|
||||
"""
|
||||
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
|
||||
and adding special tokens. A Pegasus sequence has the following format, where ``X`` represents the sequence:
|
||||
and adding special tokens. A PEGASUS sequence has the following format, where ``X`` represents the sequence:
|
||||
|
||||
- single sequence: ``X </s>``
|
||||
- pair of sequences: ``A B </s>`` (not intended use)
|
||||
@@ -156,3 +279,16 @@ class PegasusTokenizer(ReformerTokenizer):
|
||||
labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
|
||||
model_inputs["labels"] = labels
|
||||
return model_inputs
|
||||
|
||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||
if not os.path.isdir(save_directory):
|
||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||
return
|
||||
out_vocab_file = os.path.join(
|
||||
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
|
||||
)
|
||||
|
||||
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
|
||||
copyfile(self.vocab_file, out_vocab_file)
|
||||
|
||||
return (out_vocab_file,)
|
||||
|
||||
@@ -12,11 +12,17 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import List, Optional
|
||||
""" Tokenization class for model PEGASUS."""
|
||||
|
||||
|
||||
import os
|
||||
from shutil import copyfile
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from ...file_utils import add_start_docstrings, is_sentencepiece_available
|
||||
from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
|
||||
from ..reformer.tokenization_reformer_fast import ReformerTokenizerFast
|
||||
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
if is_sentencepiece_available():
|
||||
@@ -25,6 +31,9 @@ else:
|
||||
PegasusTokenizer = None
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
SPIECE_UNDERLINE = "▁"
|
||||
|
||||
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
|
||||
@@ -39,21 +48,112 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
}
|
||||
|
||||
|
||||
class PegasusTokenizerFast(ReformerTokenizerFast):
|
||||
class PegasusTokenizerFast(PreTrainedTokenizerFast):
|
||||
r"""
|
||||
Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
|
||||
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods.
|
||||
Users should refer to this superclass for more information regarding those methods.
|
||||
|
||||
Args:
|
||||
vocab_file (:obj:`str`):
|
||||
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
|
||||
contains the vocabulary necessary to instantiate a tokenizer.
|
||||
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
|
||||
The token used for padding, for example when batching sequences of different lengths.
|
||||
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
|
||||
The end of sequence token.
|
||||
|
||||
.. note::
|
||||
|
||||
When building a sequence using special tokens, this is not the token that is used for the end of
|
||||
sequence. The token used is the :obj:`sep_token`.
|
||||
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask_2>"`):
|
||||
The token used for masking single token values. This is the token used when training this model with masked
|
||||
language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
|
||||
It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
|
||||
Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
|
||||
mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`"<mask_1>"`):
|
||||
The token used for masking whole target sentences. This is the token used when training this model with gap
|
||||
sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
|
||||
pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for
|
||||
Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
|
||||
additional_special_tokens (:obj:`List[str]`, `optional`):
|
||||
Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_1> and
|
||||
<unk_2>, ..., <unk_102> are used as additional special tokens corresponding to the `original PEGASUS
|
||||
tokenizer
|
||||
<https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66>`__
|
||||
that uses the tokens 2 - 104 only for pretraining
|
||||
"""
|
||||
offset = 103 # entries 2-104 are only used for pretraining
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
slow_tokenizer_class = PegasusTokenizer
|
||||
model_input_names = ["attention_mask"]
|
||||
|
||||
# def num_special_tokens_to_add(self, pair=False):
|
||||
# """Just EOS"""
|
||||
# return 1
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
tokenizer_file=None,
|
||||
pad_token="<pad>",
|
||||
eos_token="</s>",
|
||||
unk_token="<unk>",
|
||||
mask_token="<mask_2>",
|
||||
mask_token_sent="<mask_1>",
|
||||
additional_special_tokens=None,
|
||||
**kwargs
|
||||
):
|
||||
if additional_special_tokens is not None:
|
||||
assert isinstance(
|
||||
additional_special_tokens, list
|
||||
), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}"
|
||||
|
||||
additional_special_tokens_extended = (
|
||||
([mask_token_sent] + additional_special_tokens)
|
||||
if mask_token_sent not in additional_special_tokens
|
||||
else additional_special_tokens
|
||||
)
|
||||
# fill the remaining slots with <unk_{i}> placeholder tokens in case not all additional tokens are already taken
|
||||
additional_special_tokens_extended += [
|
||||
f"<unk_{i}>" for i in range(len(additional_special_tokens_extended), self.offset - 1)
|
||||
]
|
||||
|
||||
if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
|
||||
raise ValueError(
|
||||
f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of <unk_x> tokens. Found {additional_special_tokens_extended}."
|
||||
)
|
||||
additional_special_tokens = additional_special_tokens_extended
|
||||
else:
|
||||
additional_special_tokens = [mask_token_sent]
|
||||
additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
|
||||
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
tokenizer_file=tokenizer_file,
|
||||
pad_token=pad_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
mask_token=mask_token,
|
||||
mask_token_sent=mask_token_sent,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_file = vocab_file
|
||||
|
||||
def _special_token_mask(self, seq):
|
||||
all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp
|
||||
all_special_ids.remove(self.unk_token_id) # <unk> is only sometimes special
|
||||
assert all_special_ids == set([0, 1])
|
||||
|
||||
assert all_special_ids == set(
|
||||
range(len(self.additional_special_tokens) + 3)
|
||||
), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
|
||||
|
||||
return [1 if x in all_special_ids else 0 for x in seq]
|
||||
|
||||
def get_special_tokens_mask(
|
||||
@@ -117,3 +217,16 @@ class PegasusTokenizerFast(ReformerTokenizerFast):
|
||||
labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
|
||||
model_inputs["labels"] = labels
|
||||
return model_inputs
|
||||
|
||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||
if not os.path.isdir(save_directory):
|
||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||
return
|
||||
out_vocab_file = os.path.join(
|
||||
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
|
||||
)
|
||||
|
||||
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
|
||||
copyfile(self.vocab_file, out_vocab_file)
|
||||
|
||||
return (out_vocab_file,)
|
||||
|
||||
@@ -64,8 +64,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
|
||||
class ReformerTokenizerFast(PreTrainedTokenizerFast):
|
||||
"""
|
||||
Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
|
||||
<https://github.com/google/sentencepiece>`__ .
|
||||
Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
|
||||
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||
methods. Users should refer to this superclass for more information regarding those methods.
|
||||
|
||||
@@ -75,8 +75,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
|
||||
class T5TokenizerFast(PreTrainedTokenizerFast):
|
||||
"""
|
||||
Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
|
||||
<https://github.com/google/sentencepiece>`__ .
|
||||
Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
|
||||
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||
methods. Users should refer to this superclass for more information regarding those methods.
|
||||
|
||||
@@ -66,8 +66,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
|
||||
"""
|
||||
Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
|
||||
:class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on `SentencePiece
|
||||
<https://github.com/google/sentencepiece>`__.
|
||||
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
|
||||
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||
methods. Users should refer to this superclass for more information regarding those methods.
|
||||
|
||||
@@ -62,8 +62,8 @@ SEG_ID_PAD = 4
|
||||
|
||||
class XLNetTokenizerFast(PreTrainedTokenizerFast):
|
||||
"""
|
||||
Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
|
||||
<https://github.com/google/sentencepiece>`__.
|
||||
Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
|
||||
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||
methods. Users should refer to this superclass for more information regarding those methods.
|
||||
|
||||
@@ -26,21 +26,34 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer.save_pretrained(self.tmpdirname)
|
||||
|
||||
@cached_property
|
||||
def pegasus_large_tokenizer(self):
|
||||
def _large_tokenizer(self):
|
||||
return PegasusTokenizer.from_pretrained("google/pegasus-large")
|
||||
|
||||
@unittest.skip("add_tokens does not work yet")
|
||||
def test_swap_special_token(self):
|
||||
pass
|
||||
|
||||
def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
|
||||
return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_input_output_texts(self, tokenizer):
|
||||
return ("This is a test", "This is a test")
|
||||
|
||||
def test_pegasus_large_tokenizer_settings(self):
|
||||
tokenizer = self.pegasus_large_tokenizer
|
||||
def test_mask_tokens_rust_pegasus(self):
|
||||
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
|
||||
py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
|
||||
raw_input_str = "Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important </s> <pad> <pad> <pad>"
|
||||
rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
|
||||
py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
|
||||
# TODO: (Thom, Patrick) - this fails because the rust tokenizer does not know about the <mask_1>, <mask_2>, and those <unk_token_x> yet
|
||||
self.assertListEqual(py_ids, rust_ids)
|
||||
|
||||
def test_large_mask_tokens(self):
|
||||
tokenizer = self._large_tokenizer
|
||||
# <mask_1> masks whole sentence while <mask_2> masks single word
|
||||
raw_input_str = "<mask_1> To ensure a <mask_2> flow of bank resolutions."
|
||||
desired_result = [2, 413, 615, 114, 3, 1971, 113, 1679, 10710, 107, 1]
|
||||
ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
|
||||
self.assertListEqual(desired_result, ids)
|
||||
|
||||
def test_large_tokenizer_settings(self):
|
||||
tokenizer = self._large_tokenizer
|
||||
# The tracebacks for the following asserts are **better** without messages or self.assertEqual
|
||||
assert tokenizer.vocab_size == 96103
|
||||
assert tokenizer.pad_token_id == 0
|
||||
@@ -48,20 +61,18 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
assert tokenizer.offset == 103
|
||||
assert tokenizer.unk_token_id == tokenizer.offset + 2 == 105
|
||||
assert tokenizer.unk_token == "<unk>"
|
||||
assert tokenizer.mask_token is None
|
||||
assert tokenizer.mask_token_id is None
|
||||
assert tokenizer.model_max_length == 1024
|
||||
raw_input_str = "To ensure a smooth flow of bank resolutions."
|
||||
desired_result = [413, 615, 114, 2291, 1971, 113, 1679, 10710, 107, 1]
|
||||
ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
|
||||
self.assertListEqual(desired_result, ids)
|
||||
assert tokenizer.convert_ids_to_tokens([0, 1, 2]) == ["<pad>", "</s>", "unk_2"]
|
||||
assert tokenizer.convert_ids_to_tokens([0, 1, 2, 3]) == ["<pad>", "</s>", "<mask_1>", "<mask_2>"]
|
||||
|
||||
@require_torch
|
||||
def test_pegasus_large_seq2seq_truncation(self):
|
||||
def test_large_seq2seq_truncation(self):
|
||||
src_texts = ["This is going to be way too long." * 150, "short example"]
|
||||
tgt_texts = ["not super long but more than 5 tokens", "tiny"]
|
||||
batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(
|
||||
batch = self._large_tokenizer.prepare_seq2seq_batch(
|
||||
src_texts, tgt_texts=tgt_texts, max_target_length=5, return_tensors="pt"
|
||||
)
|
||||
assert batch.input_ids.shape == (2, 1024)
|
||||
|
||||
Reference in New Issue
Block a user