[Pegasus] Refactor Tokenizer (#8731)
* refactor * further refactor * fix the rest tomorrow * save intermediate * finish slow tokenizer * make more tests pass * finish refactor * fix comment * clean further * fix name * fix naming * Update src/transformers/models/reformer/tokenization_reformer.py * Apply suggestions from code review * Apply suggestions from code review * refactor * fix init tokenizers * refactor * improve convert * refactor * correct convert slow tokenizer * final fix for Pegasus Tok * remove ipdb * improve links
This commit is contained in:
committed by
GitHub
parent
36b60ce9e8
commit
5ced23dc84
@@ -547,10 +547,12 @@ class BertGenerationConverter(SpmConverter):
|
|||||||
class PegasusConverter(SpmConverter):
|
class PegasusConverter(SpmConverter):
|
||||||
def vocab(self, proto):
|
def vocab(self, proto):
|
||||||
vocab = [
|
vocab = [
|
||||||
(self.original_tokenizer.pad_token, 0),
|
(self.original_tokenizer.pad_token, 0.0),
|
||||||
(self.original_tokenizer.eos_token, 0),
|
(self.original_tokenizer.eos_token, 0.0),
|
||||||
|
(self.original_tokenizer.mask_token_sent, 0.0),
|
||||||
|
(self.original_tokenizer.mask_token, 0.0),
|
||||||
]
|
]
|
||||||
vocab += [(f"unk_{i}", -100) for i in range(2, 2 + self.original_tokenizer.offset)]
|
vocab += [(f"<unk_{i}>", -100.0) for i in range(2, self.original_tokenizer.offset)]
|
||||||
vocab += [(piece.piece, piece.score) for piece in proto.pieces[2:]]
|
vocab += [(piece.piece, piece.score) for piece in proto.pieces[2:]]
|
||||||
return vocab
|
return vocab
|
||||||
|
|
||||||
@@ -559,13 +561,10 @@ class PegasusConverter(SpmConverter):
|
|||||||
|
|
||||||
def post_processor(self):
|
def post_processor(self):
|
||||||
eos = self.original_tokenizer.eos_token
|
eos = self.original_tokenizer.eos_token
|
||||||
return processors.TemplateProcessing(
|
special_tokens = [
|
||||||
single=["$A", eos],
|
|
||||||
pair=["$A", "$B", eos],
|
|
||||||
special_tokens=[
|
|
||||||
(eos, self.original_tokenizer.eos_token_id),
|
(eos, self.original_tokenizer.eos_token_id),
|
||||||
],
|
]
|
||||||
)
|
return processors.TemplateProcessing(single=["$A", eos], pair=["$A", "$B", eos], special_tokens=special_tokens)
|
||||||
|
|
||||||
|
|
||||||
class T5Converter(SpmConverter):
|
class T5Converter(SpmConverter):
|
||||||
|
|||||||
@@ -71,10 +71,10 @@ SPIECE_UNDERLINE = "▁"
|
|||||||
|
|
||||||
class AlbertTokenizerFast(PreTrainedTokenizerFast):
|
class AlbertTokenizerFast(PreTrainedTokenizerFast):
|
||||||
"""
|
"""
|
||||||
Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
|
Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
|
||||||
<https://github.com/google/sentencepiece>`__. This tokenizer inherits from
|
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
|
||||||
:class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this
|
inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
|
||||||
superclass for more information regarding those methods
|
refer to this superclass for more information regarding those methods
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_file (:obj:`str`):
|
vocab_file (:obj:`str`):
|
||||||
|
|||||||
@@ -60,8 +60,8 @@ SPIECE_UNDERLINE = "▁"
|
|||||||
class CamembertTokenizerFast(PreTrainedTokenizerFast):
|
class CamembertTokenizerFast(PreTrainedTokenizerFast):
|
||||||
"""
|
"""
|
||||||
Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
|
Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
|
||||||
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece
|
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
|
||||||
<https://github.com/google/sentencepiece>`__.
|
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
|
||||||
|
|
||||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||||
methods. Users should refer to this superclass for more information regarding those methods.
|
methods. Users should refer to this superclass for more information regarding those methods.
|
||||||
|
|||||||
@@ -67,7 +67,8 @@ FAIRSEQ_LANGUAGE_CODES = [
|
|||||||
|
|
||||||
class MBartTokenizerFast(XLMRobertaTokenizerFast):
|
class MBartTokenizerFast(XLMRobertaTokenizerFast):
|
||||||
"""
|
"""
|
||||||
Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library).
|
Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). Based on `BPE
|
||||||
|
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
|
||||||
|
|
||||||
:class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds
|
:class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds
|
||||||
a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`.
|
a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`.
|
||||||
|
|||||||
@@ -12,11 +12,16 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
from typing import Dict, List, Optional
|
import os
|
||||||
|
from shutil import copyfile
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import sentencepiece as spm
|
||||||
|
|
||||||
from ...file_utils import add_start_docstrings
|
from ...file_utils import add_start_docstrings
|
||||||
|
from ...tokenization_utils import PreTrainedTokenizer
|
||||||
from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
|
from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
|
||||||
from ..reformer.tokenization_reformer import ReformerTokenizer
|
from ...utils import logging
|
||||||
|
|
||||||
|
|
||||||
SPIECE_UNDERLINE = "▁"
|
SPIECE_UNDERLINE = "▁"
|
||||||
@@ -32,31 +37,145 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class PegasusTokenizer(ReformerTokenizer):
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PegasusTokenizer(PreTrainedTokenizer):
|
||||||
r"""
|
r"""
|
||||||
Construct a Pegasus tokenizer.
|
Construct a PEGASUS tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
|
||||||
|
|
||||||
:class:`~transformers.PegasusTokenizer` is identical to :class:`~transformers.ReformerTokenizer` and adds a new
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
|
||||||
:meth:`~transformers.PegasusTokenizer.prepare_seq2seq_batch`
|
Users should refer to this superclass for more information regarding those methods.
|
||||||
|
|
||||||
Refer to superclass :class:`~transformers.ReformerTokenizer` for usage examples and documentation concerning the
|
Args:
|
||||||
initialization parameters and other methods.
|
vocab_file (:obj:`str`):
|
||||||
|
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
|
||||||
|
contains the vocabulary necessary to instantiate a tokenizer.
|
||||||
|
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
|
||||||
|
The token used for padding, for example when batching sequences of different lengths.
|
||||||
|
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
|
||||||
|
The end of sequence token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the end of
|
||||||
|
sequence. The token used is the :obj:`sep_token`.
|
||||||
|
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask_2>"`):
|
||||||
|
The token used for masking single token values. This is the token used when training this model with masked
|
||||||
|
language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
|
||||||
|
It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
|
||||||
|
Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
|
||||||
|
mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`"<mask_1>"`):
|
||||||
|
The token used for masking whole target sentences. This is the token used when training this model with gap
|
||||||
|
sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
|
||||||
|
pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for
|
||||||
|
Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
|
||||||
|
additional_special_tokens (:obj:`List[str]`, `optional`):
|
||||||
|
Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_1> and
|
||||||
|
<unk_2>, ..., <unk_102> are used as additional special tokens corresponding to the `original PEGASUS
|
||||||
|
tokenizer
|
||||||
|
<https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66>`__
|
||||||
|
that uses the tokens 2 - 104 only for pretraining
|
||||||
"""
|
"""
|
||||||
offset = 103 # entries 2-104 are only used for pretraining
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|
||||||
|
offset = 103 # entries 2 - 104 are only used for pretraining
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||||
|
model_input_names = ["attention_mask"]
|
||||||
|
|
||||||
def __init__(self, *args, pad_token="<pad>", **kwargs):
|
def __init__(
|
||||||
super().__init__(*args, **kwargs, pad_token="<pad>")
|
self,
|
||||||
# Don't use reserved words added_token_encoder, added_tokens_decoder because of
|
vocab_file,
|
||||||
# AssertionError: Non-consecutive added token '1' found. in from_pretrained
|
pad_token="<pad>",
|
||||||
assert len(self.added_tokens_decoder) == 0
|
eos_token="</s>",
|
||||||
self.encoder: Dict[int, str] = {0: self.pad_token, 1: self.eos_token}
|
unk_token="<unk>",
|
||||||
# entries 2-104 are only used for pretraining and called unk_2, ...unk_104
|
mask_token="<mask_2>",
|
||||||
self.encoder.update({i: f"unk_{i}" for i in range(2, self.offset + 2)})
|
mask_token_sent="<mask_1>",
|
||||||
|
additional_special_tokens=None,
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
|
if additional_special_tokens is not None:
|
||||||
|
assert isinstance(
|
||||||
|
additional_special_tokens, list
|
||||||
|
), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}"
|
||||||
|
|
||||||
|
additional_special_tokens_extended = (
|
||||||
|
([mask_token_sent] + additional_special_tokens)
|
||||||
|
if mask_token_sent not in additional_special_tokens
|
||||||
|
else additional_special_tokens
|
||||||
|
)
|
||||||
|
# fill additional tokens with ..., <unk_token_102> in case not all additional tokens are already taken
|
||||||
|
additional_special_tokens_extended += [
|
||||||
|
f"<unk_{i}>" for i in range(len(additional_special_tokens_extended), self.offset - 1)
|
||||||
|
]
|
||||||
|
|
||||||
|
if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
|
||||||
|
raise ValueError(
|
||||||
|
f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of <unk_x> tokens. Found {additional_special_tokens_extended}."
|
||||||
|
)
|
||||||
|
additional_special_tokens = additional_special_tokens_extended
|
||||||
|
else:
|
||||||
|
additional_special_tokens = [mask_token_sent]
|
||||||
|
additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
|
||||||
|
|
||||||
|
super().__init__(
|
||||||
|
eos_token=eos_token,
|
||||||
|
unk_token=unk_token,
|
||||||
|
mask_token=mask_token,
|
||||||
|
pad_token=pad_token,
|
||||||
|
mask_token_sent=mask_token_sent,
|
||||||
|
additional_special_tokens=additional_special_tokens,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
self.vocab_file = vocab_file
|
||||||
|
self.sp_model = spm.SentencePieceProcessor()
|
||||||
|
self.sp_model.Load(vocab_file)
|
||||||
|
self.mask_token_sent = mask_token_sent
|
||||||
|
|
||||||
|
# add special tokens to encoder dict
|
||||||
|
self.encoder: Dict[int, str] = {
|
||||||
|
0: self.pad_token,
|
||||||
|
1: self.eos_token,
|
||||||
|
2: self.mask_token_sent,
|
||||||
|
3: self.mask_token,
|
||||||
|
}
|
||||||
|
# entries 2-104 are only used for pretraining and called <mask_1>, <mask_2>, unk_2, ...unk_102
|
||||||
|
# mask_token_sent is already added to list -> so start at 1
|
||||||
|
self.encoder.update({i + 3: additional_special_tokens[i] for i in range(1, self.offset - 1)})
|
||||||
self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
|
self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vocab_size(self) -> int:
|
||||||
|
return len(self.sp_model) + self.offset
|
||||||
|
|
||||||
|
def get_vocab(self) -> Dict[str, int]:
|
||||||
|
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
||||||
|
vocab.update(self.added_tokens_encoder)
|
||||||
|
return vocab
|
||||||
|
|
||||||
|
def __getstate__(self):
|
||||||
|
state = self.__dict__.copy()
|
||||||
|
state["sp_model"] = None
|
||||||
|
return state
|
||||||
|
|
||||||
|
def __setstate__(self, d):
|
||||||
|
self.__dict__ = d
|
||||||
|
self.sp_model = spm.SentencePieceProcessor()
|
||||||
|
self.sp_model.Load(self.vocab_file)
|
||||||
|
|
||||||
|
def _tokenize(self, text, sample=False):
|
||||||
|
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
|
||||||
|
if not sample:
|
||||||
|
pieces = self.sp_model.EncodeAsPieces(text)
|
||||||
|
else:
|
||||||
|
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
|
||||||
|
return pieces
|
||||||
|
|
||||||
def _convert_token_to_id(self, token: str) -> int:
|
def _convert_token_to_id(self, token: str) -> int:
|
||||||
""" Converts a token (str) to an id using the vocab. """
|
""" Converts a token (str) to an id using the vocab. """
|
||||||
if token in self.decoder:
|
if token in self.decoder:
|
||||||
@@ -73,13 +192,13 @@ class PegasusTokenizer(ReformerTokenizer):
|
|||||||
elif index in self.added_tokens_encoder:
|
elif index in self.added_tokens_encoder:
|
||||||
return self.added_tokens_encoder[index]
|
return self.added_tokens_encoder[index]
|
||||||
else:
|
else:
|
||||||
# assert index > self.offset, f"cannot decode ids between 2 and {self.offset}. Got {index}"
|
|
||||||
token = self.sp_model.IdToPiece(index - self.offset)
|
token = self.sp_model.IdToPiece(index - self.offset)
|
||||||
return token
|
return token
|
||||||
|
|
||||||
@property
|
def convert_tokens_to_string(self, tokens):
|
||||||
def vocab_size(self) -> int:
|
""" Converts a sequence of tokens (string) in a single string. """
|
||||||
return len(self.sp_model) + self.offset
|
out_string = self.sp_model.decode_pieces(tokens)
|
||||||
|
return out_string
|
||||||
|
|
||||||
def num_special_tokens_to_add(self, pair=False):
|
def num_special_tokens_to_add(self, pair=False):
|
||||||
"""Just EOS"""
|
"""Just EOS"""
|
||||||
@@ -88,7 +207,11 @@ class PegasusTokenizer(ReformerTokenizer):
|
|||||||
def _special_token_mask(self, seq):
|
def _special_token_mask(self, seq):
|
||||||
all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp
|
all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp
|
||||||
all_special_ids.remove(self.unk_token_id) # <unk> is only sometimes special
|
all_special_ids.remove(self.unk_token_id) # <unk> is only sometimes special
|
||||||
assert all_special_ids == set([0, 1])
|
|
||||||
|
assert all_special_ids == set(
|
||||||
|
range(len(self.additional_special_tokens) + 3)
|
||||||
|
), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
|
||||||
|
|
||||||
return [1 if x in all_special_ids else 0 for x in seq]
|
return [1 if x in all_special_ids else 0 for x in seq]
|
||||||
|
|
||||||
def get_special_tokens_mask(
|
def get_special_tokens_mask(
|
||||||
@@ -105,7 +228,7 @@ class PegasusTokenizer(ReformerTokenizer):
|
|||||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
|
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
|
||||||
and adding special tokens. A Pegasus sequence has the following format, where ``X`` represents the sequence:
|
and adding special tokens. A PEGASUS sequence has the following format, where ``X`` represents the sequence:
|
||||||
|
|
||||||
- single sequence: ``X </s>``
|
- single sequence: ``X </s>``
|
||||||
- pair of sequences: ``A B </s>`` (not intended use)
|
- pair of sequences: ``A B </s>`` (not intended use)
|
||||||
@@ -156,3 +279,16 @@ class PegasusTokenizer(ReformerTokenizer):
|
|||||||
labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
|
labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
|
||||||
model_inputs["labels"] = labels
|
model_inputs["labels"] = labels
|
||||||
return model_inputs
|
return model_inputs
|
||||||
|
|
||||||
|
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||||
|
if not os.path.isdir(save_directory):
|
||||||
|
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||||
|
return
|
||||||
|
out_vocab_file = os.path.join(
|
||||||
|
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
|
||||||
|
)
|
||||||
|
|
||||||
|
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
|
||||||
|
copyfile(self.vocab_file, out_vocab_file)
|
||||||
|
|
||||||
|
return (out_vocab_file,)
|
||||||
|
|||||||
@@ -12,11 +12,17 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
from typing import List, Optional
|
""" Tokenization class for model PEGASUS."""
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
from shutil import copyfile
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from ...file_utils import add_start_docstrings, is_sentencepiece_available
|
from ...file_utils import add_start_docstrings, is_sentencepiece_available
|
||||||
from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
|
from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
|
||||||
from ..reformer.tokenization_reformer_fast import ReformerTokenizerFast
|
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||||
|
from ...utils import logging
|
||||||
|
|
||||||
|
|
||||||
if is_sentencepiece_available():
|
if is_sentencepiece_available():
|
||||||
@@ -25,6 +31,9 @@ else:
|
|||||||
PegasusTokenizer = None
|
PegasusTokenizer = None
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
SPIECE_UNDERLINE = "▁"
|
SPIECE_UNDERLINE = "▁"
|
||||||
|
|
||||||
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
|
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
|
||||||
@@ -39,21 +48,112 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class PegasusTokenizerFast(ReformerTokenizerFast):
|
class PegasusTokenizerFast(PreTrainedTokenizerFast):
|
||||||
|
r"""
|
||||||
|
Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
|
||||||
|
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
|
||||||
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods.
|
||||||
|
Users should refer to this superclass for more information regarding those methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`str`):
|
||||||
|
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
|
||||||
|
contains the vocabulary necessary to instantiate a tokenizer.
|
||||||
|
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
|
||||||
|
The token used for padding, for example when batching sequences of different lengths.
|
||||||
|
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
|
||||||
|
The end of sequence token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the end of
|
||||||
|
sequence. The token used is the :obj:`sep_token`.
|
||||||
|
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask_2>"`):
|
||||||
|
The token used for masking single token values. This is the token used when training this model with masked
|
||||||
|
language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
|
||||||
|
It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
|
||||||
|
Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
|
||||||
|
mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`"<mask_1>"`):
|
||||||
|
The token used for masking whole target sentences. This is the token used when training this model with gap
|
||||||
|
sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
|
||||||
|
pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for
|
||||||
|
Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
|
||||||
|
additional_special_tokens (:obj:`List[str]`, `optional`):
|
||||||
|
Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_1> and
|
||||||
|
<unk_2>, ..., <unk_102> are used as additional special tokens corresponding to the `original PEGASUS
|
||||||
|
tokenizer
|
||||||
|
<https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66>`__
|
||||||
|
that uses the tokens 2 - 104 only for pretraining
|
||||||
|
"""
|
||||||
offset = 103 # entries 2-104 are only used for pretraining
|
offset = 103 # entries 2-104 are only used for pretraining
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||||
slow_tokenizer_class = PegasusTokenizer
|
slow_tokenizer_class = PegasusTokenizer
|
||||||
|
model_input_names = ["attention_mask"]
|
||||||
|
|
||||||
# def num_special_tokens_to_add(self, pair=False):
|
def __init__(
|
||||||
# """Just EOS"""
|
self,
|
||||||
# return 1
|
vocab_file,
|
||||||
|
tokenizer_file=None,
|
||||||
|
pad_token="<pad>",
|
||||||
|
eos_token="</s>",
|
||||||
|
unk_token="<unk>",
|
||||||
|
mask_token="<mask_2>",
|
||||||
|
mask_token_sent="<mask_1>",
|
||||||
|
additional_special_tokens=None,
|
||||||
|
**kwargs
|
||||||
|
):
|
||||||
|
if additional_special_tokens is not None:
|
||||||
|
assert isinstance(
|
||||||
|
additional_special_tokens, list
|
||||||
|
), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}"
|
||||||
|
|
||||||
|
additional_special_tokens_extended = (
|
||||||
|
([mask_token_sent] + additional_special_tokens)
|
||||||
|
if mask_token_sent not in additional_special_tokens
|
||||||
|
else additional_special_tokens
|
||||||
|
)
|
||||||
|
# fill additional tokens with ..., <unk_token_102> in case not all additional tokens are already taken
|
||||||
|
additional_special_tokens_extended += [
|
||||||
|
f"<unk_{i}>" for i in range(len(additional_special_tokens_extended), self.offset - 1)
|
||||||
|
]
|
||||||
|
|
||||||
|
if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
|
||||||
|
raise ValueError(
|
||||||
|
f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of <unk_x> tokens. Found {additional_special_tokens_extended}."
|
||||||
|
)
|
||||||
|
additional_special_tokens = additional_special_tokens_extended
|
||||||
|
else:
|
||||||
|
additional_special_tokens = [mask_token_sent]
|
||||||
|
additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
|
||||||
|
|
||||||
|
super().__init__(
|
||||||
|
vocab_file,
|
||||||
|
tokenizer_file=tokenizer_file,
|
||||||
|
pad_token=pad_token,
|
||||||
|
eos_token=eos_token,
|
||||||
|
unk_token=unk_token,
|
||||||
|
mask_token=mask_token,
|
||||||
|
mask_token_sent=mask_token_sent,
|
||||||
|
additional_special_tokens=additional_special_tokens,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.vocab_file = vocab_file
|
||||||
|
|
||||||
def _special_token_mask(self, seq):
|
def _special_token_mask(self, seq):
|
||||||
all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp
|
all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp
|
||||||
all_special_ids.remove(self.unk_token_id) # <unk> is only sometimes special
|
all_special_ids.remove(self.unk_token_id) # <unk> is only sometimes special
|
||||||
assert all_special_ids == set([0, 1])
|
|
||||||
|
assert all_special_ids == set(
|
||||||
|
range(len(self.additional_special_tokens) + 3)
|
||||||
|
), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
|
||||||
|
|
||||||
return [1 if x in all_special_ids else 0 for x in seq]
|
return [1 if x in all_special_ids else 0 for x in seq]
|
||||||
|
|
||||||
def get_special_tokens_mask(
|
def get_special_tokens_mask(
|
||||||
@@ -117,3 +217,16 @@ class PegasusTokenizerFast(ReformerTokenizerFast):
|
|||||||
labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
|
labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
|
||||||
model_inputs["labels"] = labels
|
model_inputs["labels"] = labels
|
||||||
return model_inputs
|
return model_inputs
|
||||||
|
|
||||||
|
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||||
|
if not os.path.isdir(save_directory):
|
||||||
|
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||||
|
return
|
||||||
|
out_vocab_file = os.path.join(
|
||||||
|
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
|
||||||
|
)
|
||||||
|
|
||||||
|
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
|
||||||
|
copyfile(self.vocab_file, out_vocab_file)
|
||||||
|
|
||||||
|
return (out_vocab_file,)
|
||||||
|
|||||||
@@ -64,8 +64,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
|
|
||||||
class ReformerTokenizerFast(PreTrainedTokenizerFast):
|
class ReformerTokenizerFast(PreTrainedTokenizerFast):
|
||||||
"""
|
"""
|
||||||
Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
|
Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
|
||||||
<https://github.com/google/sentencepiece>`__ .
|
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
|
||||||
|
|
||||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||||
methods. Users should refer to this superclass for more information regarding those methods.
|
methods. Users should refer to this superclass for more information regarding those methods.
|
||||||
|
|||||||
@@ -75,8 +75,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
|
|
||||||
class T5TokenizerFast(PreTrainedTokenizerFast):
|
class T5TokenizerFast(PreTrainedTokenizerFast):
|
||||||
"""
|
"""
|
||||||
Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
|
Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
|
||||||
<https://github.com/google/sentencepiece>`__ .
|
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
|
||||||
|
|
||||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||||
methods. Users should refer to this superclass for more information regarding those methods.
|
methods. Users should refer to this superclass for more information regarding those methods.
|
||||||
|
|||||||
@@ -66,8 +66,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
|
class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
|
||||||
"""
|
"""
|
||||||
Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
|
Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
|
||||||
:class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on `SentencePiece
|
:class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
|
||||||
<https://github.com/google/sentencepiece>`__.
|
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
|
||||||
|
|
||||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||||
methods. Users should refer to this superclass for more information regarding those methods.
|
methods. Users should refer to this superclass for more information regarding those methods.
|
||||||
|
|||||||
@@ -62,8 +62,8 @@ SEG_ID_PAD = 4
|
|||||||
|
|
||||||
class XLNetTokenizerFast(PreTrainedTokenizerFast):
|
class XLNetTokenizerFast(PreTrainedTokenizerFast):
|
||||||
"""
|
"""
|
||||||
Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
|
Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
|
||||||
<https://github.com/google/sentencepiece>`__.
|
<https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
|
||||||
|
|
||||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||||
methods. Users should refer to this superclass for more information regarding those methods.
|
methods. Users should refer to this superclass for more information regarding those methods.
|
||||||
|
|||||||
@@ -26,21 +26,34 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def pegasus_large_tokenizer(self):
|
def _large_tokenizer(self):
|
||||||
return PegasusTokenizer.from_pretrained("google/pegasus-large")
|
return PegasusTokenizer.from_pretrained("google/pegasus-large")
|
||||||
|
|
||||||
@unittest.skip("add_tokens does not work yet")
|
|
||||||
def test_swap_special_token(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
|
def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
|
||||||
return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
return ("This is a test", "This is a test")
|
return ("This is a test", "This is a test")
|
||||||
|
|
||||||
def test_pegasus_large_tokenizer_settings(self):
|
def test_mask_tokens_rust_pegasus(self):
|
||||||
tokenizer = self.pegasus_large_tokenizer
|
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
|
||||||
|
py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
|
||||||
|
raw_input_str = "Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important </s> <pad> <pad> <pad>"
|
||||||
|
rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
|
||||||
|
py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
|
||||||
|
# TODO: (Thom, Patrick) - this fails because the rust tokenizer does not know about the <mask_1>, <mask_2>, and those <unk_token_x> yet
|
||||||
|
self.assertListEqual(py_ids, rust_ids)
|
||||||
|
|
||||||
|
def test_large_mask_tokens(self):
|
||||||
|
tokenizer = self._large_tokenizer
|
||||||
|
# <mask_1> masks whole sentence while <mask_2> masks single word
|
||||||
|
raw_input_str = "<mask_1> To ensure a <mask_2> flow of bank resolutions."
|
||||||
|
desired_result = [2, 413, 615, 114, 3, 1971, 113, 1679, 10710, 107, 1]
|
||||||
|
ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
|
||||||
|
self.assertListEqual(desired_result, ids)
|
||||||
|
|
||||||
|
def test_large_tokenizer_settings(self):
|
||||||
|
tokenizer = self._large_tokenizer
|
||||||
# The tracebacks for the following asserts are **better** without messages or self.assertEqual
|
# The tracebacks for the following asserts are **better** without messages or self.assertEqual
|
||||||
assert tokenizer.vocab_size == 96103
|
assert tokenizer.vocab_size == 96103
|
||||||
assert tokenizer.pad_token_id == 0
|
assert tokenizer.pad_token_id == 0
|
||||||
@@ -48,20 +61,18 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
assert tokenizer.offset == 103
|
assert tokenizer.offset == 103
|
||||||
assert tokenizer.unk_token_id == tokenizer.offset + 2 == 105
|
assert tokenizer.unk_token_id == tokenizer.offset + 2 == 105
|
||||||
assert tokenizer.unk_token == "<unk>"
|
assert tokenizer.unk_token == "<unk>"
|
||||||
assert tokenizer.mask_token is None
|
|
||||||
assert tokenizer.mask_token_id is None
|
|
||||||
assert tokenizer.model_max_length == 1024
|
assert tokenizer.model_max_length == 1024
|
||||||
raw_input_str = "To ensure a smooth flow of bank resolutions."
|
raw_input_str = "To ensure a smooth flow of bank resolutions."
|
||||||
desired_result = [413, 615, 114, 2291, 1971, 113, 1679, 10710, 107, 1]
|
desired_result = [413, 615, 114, 2291, 1971, 113, 1679, 10710, 107, 1]
|
||||||
ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
|
ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
|
||||||
self.assertListEqual(desired_result, ids)
|
self.assertListEqual(desired_result, ids)
|
||||||
assert tokenizer.convert_ids_to_tokens([0, 1, 2]) == ["<pad>", "</s>", "unk_2"]
|
assert tokenizer.convert_ids_to_tokens([0, 1, 2, 3]) == ["<pad>", "</s>", "<mask_1>", "<mask_2>"]
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
def test_pegasus_large_seq2seq_truncation(self):
|
def test_large_seq2seq_truncation(self):
|
||||||
src_texts = ["This is going to be way too long." * 150, "short example"]
|
src_texts = ["This is going to be way too long." * 150, "short example"]
|
||||||
tgt_texts = ["not super long but more than 5 tokens", "tiny"]
|
tgt_texts = ["not super long but more than 5 tokens", "tiny"]
|
||||||
batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(
|
batch = self._large_tokenizer.prepare_seq2seq_batch(
|
||||||
src_texts, tgt_texts=tgt_texts, max_target_length=5, return_tensors="pt"
|
src_texts, tgt_texts=tgt_texts, max_target_length=5, return_tensors="pt"
|
||||||
)
|
)
|
||||||
assert batch.input_ids.shape == (2, 1024)
|
assert batch.input_ids.shape == (2, 1024)
|
||||||
|
|||||||
Reference in New Issue
Block a user