Merge pull request #1384 from huggingface/encoding-qol
Quality of life enhancements in encoding + patch MLM masking
This commit is contained in:
@@ -9,7 +9,7 @@ similar API between the different models.
|
|||||||
| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. |
|
| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. |
|
||||||
| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
|
| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
|
||||||
| [SQuAD](#squad) | Using BERT for question answering, examples with distributed training. |
|
| [SQuAD](#squad) | Using BERT for question answering, examples with distributed training. |
|
||||||
| [Multiple Choice](#multiple choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
|
| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks.
|
||||||
|
|
||||||
## Language model fine-tuning
|
## Language model fine-tuning
|
||||||
|
|
||||||
@@ -283,17 +283,17 @@ The results are the following:
|
|||||||
loss = 0.04755385363816904
|
loss = 0.04755385363816904
|
||||||
```
|
```
|
||||||
|
|
||||||
##Multiple Choice
|
## Multiple Choice
|
||||||
|
|
||||||
Based on the script [`run_multiple_choice.py`]().
|
Based on the script [`run_multiple_choice.py`]().
|
||||||
|
|
||||||
#### Fine-tuning on SWAG
|
#### Fine-tuning on SWAG
|
||||||
Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
|
Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
|
||||||
|
|
||||||
```
|
```bash
|
||||||
# Training on 4 Tesla V100 (16GB) GPUs
|
# Training on 4 Tesla V100 (16GB) GPUs
|
||||||
export SWAG_DIR=/path/to/swag_data_dir
|
export SWAG_DIR=/path/to/swag_data_dir
|
||||||
python ./examples/single_model_scripts/run_multiple_choice.py \
|
python ./examples/run_multiple_choice.py \
|
||||||
--model_type roberta \
|
--model_type roberta \
|
||||||
--task_name swag \
|
--task_name swag \
|
||||||
--model_name_or_path roberta-base \
|
--model_name_or_path roberta-base \
|
||||||
|
|||||||
@@ -271,7 +271,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
|
|||||||
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
||||||
str(args.max_seq_length),
|
str(args.max_seq_length),
|
||||||
str(task)))
|
str(task)))
|
||||||
if os.path.exists(cached_features_file):
|
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||||
logger.info("Loading features from cached file %s", cached_features_file)
|
logger.info("Loading features from cached file %s", cached_features_file)
|
||||||
features = torch.load(cached_features_file)
|
features = torch.load(cached_features_file)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ class TextDataset(Dataset):
|
|||||||
def __init__(self, tokenizer, file_path='train', block_size=512):
|
def __init__(self, tokenizer, file_path='train', block_size=512):
|
||||||
assert os.path.isfile(file_path)
|
assert os.path.isfile(file_path)
|
||||||
directory, filename = os.path.split(file_path)
|
directory, filename = os.path.split(file_path)
|
||||||
cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
|
# block_size is an int (default 512), so format it into the file name; concatenating it with str pieces raises TypeError.
cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
|
||||||
|
|
||||||
if os.path.exists(cached_features_file):
|
if os.path.exists(cached_features_file):
|
||||||
logger.info("Loading features from cached file %s", cached_features_file)
|
logger.info("Loading features from cached file %s", cached_features_file)
|
||||||
@@ -77,7 +77,7 @@ class TextDataset(Dataset):
|
|||||||
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
|
||||||
|
|
||||||
for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
|
for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
|
||||||
self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size]))
|
self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
|
||||||
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
||||||
# If your dataset is small, first you should look for a bigger one :-) and second you
|
# If your dataset is small, first you should look for a bigger one :-) and second you
|
||||||
# can change this behavior by adding (model specific) padding.
|
# can change this behavior by adding (model specific) padding.
|
||||||
@@ -139,7 +139,10 @@ def mask_tokens(inputs, tokenizer, args):
|
|||||||
""" Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
|
""" Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
|
||||||
labels = inputs.clone()
|
labels = inputs.clone()
|
||||||
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
|
# We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
|
||||||
masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).bool()
|
probability_matrix = torch.full(labels.shape, args.mlm_probability)
|
||||||
|
special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
|
||||||
|
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
|
||||||
|
masked_indices = torch.bernoulli(probability_matrix).bool()
|
||||||
labels[~masked_indices] = -1 # We only compute loss on masked tokens
|
labels[~masked_indices] = -1 # We only compute loss on masked tokens
|
||||||
|
|
||||||
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
|
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
|
||||||
|
|||||||
@@ -293,7 +293,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
|
|||||||
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
list(filter(None, args.model_name_or_path.split('/'))).pop(),
|
||||||
str(args.max_seq_length),
|
str(args.max_seq_length),
|
||||||
str(task)))
|
str(task)))
|
||||||
if os.path.exists(cached_features_file):
|
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||||
logger.info("Loading features from cached file %s", cached_features_file)
|
logger.info("Loading features from cached file %s", cached_features_file)
|
||||||
features = torch.load(cached_features_file)
|
features = torch.load(cached_features_file)
|
||||||
else:
|
else:
|
||||||
@@ -306,14 +306,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
|
|||||||
else:
|
else:
|
||||||
examples = processor.get_train_examples(args.data_dir)
|
examples = processor.get_train_examples(args.data_dir)
|
||||||
logger.info("Training number: %s", str(len(examples)))
|
logger.info("Training number: %s", str(len(examples)))
|
||||||
features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
|
features = convert_examples_to_features(
|
||||||
cls_token_at_end=bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end
|
examples,
|
||||||
cls_token=tokenizer.cls_token,
|
label_list,
|
||||||
sep_token=tokenizer.sep_token,
|
args.max_seq_length,
|
||||||
sep_token_extra=bool(args.model_type in ['roberta']),
|
tokenizer,
|
||||||
cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
|
|
||||||
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
|
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
|
||||||
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
|
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
|
||||||
|
)
|
||||||
if args.local_rank in [-1, 0]:
|
if args.local_rank in [-1, 0]:
|
||||||
logger.info("Saving features into cached file %s", cached_features_file)
|
logger.info("Saving features into cached file %s", cached_features_file)
|
||||||
torch.save(features, cached_features_file)
|
torch.save(features, cached_features_file)
|
||||||
@@ -362,7 +362,7 @@ def main():
|
|||||||
help="Whether to run eval on the dev set.")
|
help="Whether to run eval on the dev set.")
|
||||||
parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
|
parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
|
||||||
parser.add_argument("--evaluate_during_training", action='store_true',
|
parser.add_argument("--evaluate_during_training", action='store_true',
|
||||||
help="Rul evaluation during training at each logging step.")
|
help="Run evaluation during training at each logging step.")
|
||||||
parser.add_argument("--do_lower_case", action='store_true',
|
parser.add_argument("--do_lower_case", action='store_true',
|
||||||
help="Set this flag if you are using an uncased model.")
|
help="Set this flag if you are using an uncased model.")
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" BERT multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """
|
""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
@@ -26,6 +26,8 @@ import json
|
|||||||
import csv
|
import csv
|
||||||
import glob
|
import glob
|
||||||
import tqdm
|
import tqdm
|
||||||
|
from typing import List
|
||||||
|
from transformers import PreTrainedTokenizer
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -34,13 +36,13 @@ logger = logging.getLogger(__name__)
|
|||||||
class InputExample(object):
|
class InputExample(object):
|
||||||
"""A single training/test example for multiple choice"""
|
"""A single training/test example for multiple choice"""
|
||||||
|
|
||||||
def __init__(self, example_id, question, contexts, endings, label=None):
|
def __init__(self, example_id, question, contexts, endings, label=None):
|
||||||
"""Constructs an InputExample.
|
"""Constructs an InputExample.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
example_id: Unique id for the example.
|
example_id: Unique id for the example.
|
||||||
contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
|
contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
|
||||||
question: string. The untokenized text of the second sequence (qustion).
|
question: string. The untokenized text of the second sequence (question).
|
||||||
endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
|
endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
|
||||||
label: (Optional) string. The label of the example. This should be
|
label: (Optional) string. The label of the example. This should be
|
||||||
specified for train and dev examples, but not for test examples.
|
specified for train and dev examples, but not for test examples.
|
||||||
@@ -66,7 +68,7 @@ class InputFeatures(object):
|
|||||||
'input_mask': input_mask,
|
'input_mask': input_mask,
|
||||||
'segment_ids': segment_ids
|
'segment_ids': segment_ids
|
||||||
}
|
}
|
||||||
for _, input_ids, input_mask, segment_ids in choices_features
|
for input_ids, input_mask, segment_ids in choices_features
|
||||||
]
|
]
|
||||||
self.label = label
|
self.label = label
|
||||||
|
|
||||||
@@ -192,7 +194,7 @@ class SwagProcessor(DataProcessor):
|
|||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
|
||||||
def _create_examples(self, lines, type):
|
def _create_examples(self, lines: List[List[str]], type: str):
|
||||||
"""Creates examples for the training and dev sets."""
|
"""Creates examples for the training and dev sets."""
|
||||||
if type == "train" and lines[0][-1] != 'label':
|
if type == "train" and lines[0][-1] != 'label':
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -300,24 +302,18 @@ class ArcProcessor(DataProcessor):
|
|||||||
return examples
|
return examples
|
||||||
|
|
||||||
|
|
||||||
def convert_examples_to_features(examples, label_list, max_seq_length,
|
def convert_examples_to_features(
|
||||||
tokenizer,
|
examples: List[InputExample],
|
||||||
cls_token_at_end=False,
|
label_list: List[str],
|
||||||
cls_token='[CLS]',
|
max_length: int,
|
||||||
cls_token_segment_id=1,
|
tokenizer: PreTrainedTokenizer,
|
||||||
sep_token='[SEP]',
|
pad_token_segment_id=0,
|
||||||
sequence_a_segment_id=0,
|
pad_on_left=False,
|
||||||
sequence_b_segment_id=1,
|
pad_token=0,
|
||||||
sep_token_extra=False,
|
mask_padding_with_zero=True,
|
||||||
pad_token_segment_id=0,
|
) -> List[InputFeatures]:
|
||||||
pad_on_left=False,
|
"""
|
||||||
pad_token=0,
|
Loads a data file into a list of `InputFeatures`
|
||||||
mask_padding_with_zero=True):
|
|
||||||
""" Loads a data file into a list of `InputBatch`s
|
|
||||||
`cls_token_at_end` define the location of the CLS token:
|
|
||||||
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
|
|
||||||
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
|
|
||||||
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
label_map = {label : i for i, label in enumerate(label_list)}
|
label_map = {label : i for i, label in enumerate(label_list)}
|
||||||
@@ -328,125 +324,70 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
|
|||||||
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
|
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
|
||||||
choices_features = []
|
choices_features = []
|
||||||
for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
|
for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
|
||||||
tokens_a = tokenizer.tokenize(context)
|
text_a = context
|
||||||
tokens_b = None
|
|
||||||
if example.question.find("_") != -1:
|
if example.question.find("_") != -1:
|
||||||
#this is for cloze question
|
# this is for cloze question
|
||||||
tokens_b = tokenizer.tokenize(example.question.replace("_", ending))
|
text_b = example.question.replace("_", ending)
|
||||||
else:
|
else:
|
||||||
tokens_b = tokenizer.tokenize(example.question + " " + ending)
|
text_b = example.question + " " + ending
|
||||||
# you can add seq token between quesiotn and ending. This does not make too much difference.
|
|
||||||
# tokens_b = tokenizer.tokenize(example.question)
|
|
||||||
# tokens_b += [sep_token]
|
|
||||||
# if sep_token_extra:
|
|
||||||
# tokens_b += [sep_token]
|
|
||||||
# tokens_b += tokenizer.tokenize(ending)
|
|
||||||
|
|
||||||
special_tokens_count = 4 if sep_token_extra else 3
|
inputs = tokenizer.encode_plus(
|
||||||
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
|
text_a,
|
||||||
|
text_b,
|
||||||
|
add_special_tokens=True,
|
||||||
|
max_length=max_length,
|
||||||
|
)
|
||||||
|
if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
|
||||||
|
logger.info('Attention! you are cropping tokens (swag task is ok). '
|
||||||
|
'If you are training ARC and RACE and you are poping question + options,'
|
||||||
|
'you need to try to use a bigger max seq length!')
|
||||||
|
|
||||||
# The convention in BERT is:
|
input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
|
||||||
# (a) For sequence pairs:
|
|
||||||
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
|
||||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
|
||||||
# (b) For single sequences:
|
|
||||||
# tokens: [CLS] the dog is hairy . [SEP]
|
|
||||||
# type_ids: 0 0 0 0 0 0 0
|
|
||||||
#
|
|
||||||
# Where "type_ids" are used to indicate whether this is the first
|
|
||||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
|
||||||
# `type=1` were learned during pre-training and are added to the wordpiece
|
|
||||||
# embedding vector (and position vector). This is not *strictly* necessary
|
|
||||||
# since the [SEP] token unambiguously separates the sequences, but it makes
|
|
||||||
# it easier for the model to learn the concept of sequences.
|
|
||||||
#
|
|
||||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
|
||||||
# used as as the "sentence vector". Note that this only makes sense because
|
|
||||||
# the entire model is fine-tuned.
|
|
||||||
tokens = tokens_a + [sep_token]
|
|
||||||
if sep_token_extra:
|
|
||||||
# roberta uses an extra separator b/w pairs of sentences
|
|
||||||
tokens += [sep_token]
|
|
||||||
|
|
||||||
segment_ids = [sequence_a_segment_id] * len(tokens)
|
|
||||||
|
|
||||||
if tokens_b:
|
|
||||||
tokens += tokens_b + [sep_token]
|
|
||||||
segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
|
|
||||||
|
|
||||||
if cls_token_at_end:
|
|
||||||
tokens = tokens + [cls_token]
|
|
||||||
segment_ids = segment_ids + [cls_token_segment_id]
|
|
||||||
else:
|
|
||||||
tokens = [cls_token] + tokens
|
|
||||||
segment_ids = [cls_token_segment_id] + segment_ids
|
|
||||||
|
|
||||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
|
||||||
|
|
||||||
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
# The mask has 1 for real tokens and 0 for padding tokens. Only real
|
||||||
# tokens are attended to.
|
# tokens are attended to.
|
||||||
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
|
attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
|
||||||
|
|
||||||
# Zero-pad up to the sequence length.
|
# Zero-pad up to the sequence length.
|
||||||
padding_length = max_seq_length - len(input_ids)
|
padding_length = max_length - len(input_ids)
|
||||||
if pad_on_left:
|
if pad_on_left:
|
||||||
input_ids = ([pad_token] * padding_length) + input_ids
|
input_ids = ([pad_token] * padding_length) + input_ids
|
||||||
input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
|
attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
|
||||||
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
|
token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
|
||||||
else:
|
else:
|
||||||
input_ids = input_ids + ([pad_token] * padding_length)
|
input_ids = input_ids + ([pad_token] * padding_length)
|
||||||
input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
|
attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
|
||||||
segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
|
token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
|
||||||
|
|
||||||
|
assert len(input_ids) == max_length
|
||||||
|
assert len(attention_mask) == max_length
|
||||||
|
assert len(token_type_ids) == max_length
|
||||||
|
choices_features.append((input_ids, attention_mask, token_type_ids))
|
||||||
|
|
||||||
|
|
||||||
assert len(input_ids) == max_seq_length
|
|
||||||
assert len(input_mask) == max_seq_length
|
|
||||||
assert len(segment_ids) == max_seq_length
|
|
||||||
choices_features.append((tokens, input_ids, input_mask, segment_ids))
|
|
||||||
label = label_map[example.label]
|
label = label_map[example.label]
|
||||||
|
|
||||||
if ex_index < 2:
|
if ex_index < 2:
|
||||||
logger.info("*** Example ***")
|
logger.info("*** Example ***")
|
||||||
logger.info("race_id: {}".format(example.example_id))
|
logger.info("race_id: {}".format(example.example_id))
|
||||||
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
|
for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
|
||||||
logger.info("choice: {}".format(choice_idx))
|
logger.info("choice: {}".format(choice_idx))
|
||||||
logger.info("tokens: {}".format(' '.join(tokens)))
|
|
||||||
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
|
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
|
||||||
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
|
logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask))))
|
||||||
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
|
logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids))))
|
||||||
logger.info("label: {}".format(label))
|
logger.info("label: {}".format(label))
|
||||||
|
|
||||||
features.append(
|
features.append(
|
||||||
InputFeatures(
|
InputFeatures(
|
||||||
example_id = example.example_id,
|
example_id=example.example_id,
|
||||||
choices_features = choices_features,
|
choices_features=choices_features,
|
||||||
label = label
|
label=label,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
return features
|
return features
|
||||||
|
|
||||||
|
|
||||||
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
|
||||||
"""Truncates a sequence pair in place to the maximum length."""
|
|
||||||
|
|
||||||
# This is a simple heuristic which will always truncate the longer sequence
|
|
||||||
# one token at a time. This makes more sense than truncating an equal percent
|
|
||||||
# of tokens from each, since if one sequence is very short then each token
|
|
||||||
# that's truncated likely contains more information than a longer sequence.
|
|
||||||
|
|
||||||
# However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger
|
|
||||||
# length or only pop from context
|
|
||||||
while True:
|
|
||||||
total_length = len(tokens_a) + len(tokens_b)
|
|
||||||
if total_length <= max_length:
|
|
||||||
break
|
|
||||||
if len(tokens_a) > len(tokens_b):
|
|
||||||
tokens_a.pop()
|
|
||||||
else:
|
|
||||||
logger.info('Attention! you are removing from token_b (swag task is ok). '
|
|
||||||
'If you are training ARC and RACE (you are poping question + options), '
|
|
||||||
'you need to try to use a bigger max seq length!')
|
|
||||||
tokens_b.pop()
|
|
||||||
|
|
||||||
|
|
||||||
processors = {
|
processors = {
|
||||||
@@ -456,7 +397,7 @@ processors = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
GLUE_TASKS_NUM_LABELS = {
|
MULTIPLE_CHOICE_TASKS_NUM_LABELS = {
|
||||||
"race", 4,
|
"race", 4,
|
||||||
"swag", 4,
|
"swag", 4,
|
||||||
"arc", 4
|
"arc", 4
|
||||||
|
|||||||
@@ -86,7 +86,6 @@ def glue_convert_examples_to_features(examples, tokenizer,
|
|||||||
example.text_b,
|
example.text_b,
|
||||||
add_special_tokens=True,
|
add_special_tokens=True,
|
||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
truncate_first_sequence=True # We're truncating the first sequence in priority
|
|
||||||
)
|
)
|
||||||
input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
|
input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
|
||||||
|
|
||||||
|
|||||||
@@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders")
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build")
|
||||||
|
|
||||||
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|
||||||
assert encoded_sentence == [101] + text + [102]
|
assert encoded_sentence == [101] + text + [102]
|
||||||
assert encoded_pair == [101] + text + [102] + text_2 + [102]
|
assert encoded_pair == [101] + text + [102] + text_2 + [102]
|
||||||
|
|||||||
@@ -36,8 +36,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
|
|||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders")
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build")
|
||||||
|
|
||||||
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|
||||||
assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
|
assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
|
||||||
assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
|
assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
|
||||||
|
|||||||
@@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
|
encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
|
||||||
encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
|
encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
|
||||||
|
|
||||||
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|
||||||
assert encoded_sentence == encoded_text_from_decode
|
assert encoded_sentence == encoded_text_from_decode
|
||||||
assert encoded_pair == encoded_pair_from_decode
|
assert encoded_pair == encoded_pair_from_decode
|
||||||
|
|||||||
@@ -193,12 +193,12 @@ class CommonTestCases:
|
|||||||
|
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
|
if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] != "PreTrainedTokenizer":
|
||||||
seq_0 = "Test this method."
|
seq_0 = "Test this method."
|
||||||
seq_1 = "With these inputs."
|
seq_1 = "With these inputs."
|
||||||
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
|
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
|
||||||
sequences, mask = information["input_ids"], information["token_type_ids"]
|
sequences, mask = information["input_ids"], information["token_type_ids"]
|
||||||
assert len(sequences) == len(mask)
|
self.assertEqual(len(sequences), len(mask))
|
||||||
|
|
||||||
def test_number_of_added_tokens(self):
|
def test_number_of_added_tokens(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
@@ -211,7 +211,7 @@ class CommonTestCases:
|
|||||||
|
|
||||||
# Method is implemented (e.g. not GPT-2)
|
# Method is implemented (e.g. not GPT-2)
|
||||||
if len(attached_sequences) != 2:
|
if len(attached_sequences) != 2:
|
||||||
assert tokenizer.num_added_tokens(pair=True) == len(attached_sequences) - len(sequences)
|
self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))
|
||||||
|
|
||||||
def test_maximum_encoding_length_single_input(self):
|
def test_maximum_encoding_length_single_input(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
@@ -227,10 +227,10 @@ class CommonTestCases:
|
|||||||
truncated_sequence = information["input_ids"]
|
truncated_sequence = information["input_ids"]
|
||||||
overflowing_tokens = information["overflowing_tokens"]
|
overflowing_tokens = information["overflowing_tokens"]
|
||||||
|
|
||||||
assert len(overflowing_tokens) == 2 + stride
|
self.assertEqual(len(overflowing_tokens), 2 + stride)
|
||||||
assert overflowing_tokens == sequence[-(2 + stride):]
|
self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
|
||||||
assert len(truncated_sequence) == total_length - 2
|
self.assertEqual(len(truncated_sequence), total_length - 2)
|
||||||
assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2])
|
self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))
|
||||||
|
|
||||||
def test_maximum_encoding_length_pair_input(self):
|
def test_maximum_encoding_length_pair_input(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
@@ -243,26 +243,26 @@ class CommonTestCases:
|
|||||||
sequence_1_no_special_tokens = tokenizer.encode(seq_1)
|
sequence_1_no_special_tokens = tokenizer.encode(seq_1)
|
||||||
|
|
||||||
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
|
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
|
||||||
truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair(
|
truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
|
||||||
tokenizer.encode(seq_0),
|
tokenizer.encode(seq_0),
|
||||||
tokenizer.encode(seq_1)[:-2]
|
tokenizer.encode(seq_1)[:-2]
|
||||||
)
|
)
|
||||||
|
|
||||||
information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
|
information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
|
||||||
stride=stride, truncate_first_sequence=False)
|
stride=stride, truncation_strategy='only_second')
|
||||||
information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
|
information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
|
||||||
add_special_tokens=True, stride=stride,
|
add_special_tokens=True, stride=stride,
|
||||||
truncate_first_sequence=True)
|
truncation_strategy='only_first')
|
||||||
|
|
||||||
truncated_sequence = information["input_ids"]
|
truncated_sequence = information["input_ids"]
|
||||||
overflowing_tokens = information["overflowing_tokens"]
|
overflowing_tokens = information["overflowing_tokens"]
|
||||||
overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
|
overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
|
||||||
|
|
||||||
assert len(overflowing_tokens) == 2 + stride
|
self.assertEqual(len(overflowing_tokens), 2 + stride)
|
||||||
assert overflowing_tokens == sequence_1_no_special_tokens[-(2 + stride):]
|
self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):])
|
||||||
assert overflowing_tokens_first_truncated == sequence_0_no_special_tokens[-(2 + stride):]
|
self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):])
|
||||||
assert len(truncated_sequence) == len(sequence) - 2
|
self.assertEqual(len(truncated_sequence), len(sequence) - 2)
|
||||||
assert truncated_sequence == truncated_second_sequence
|
self.assertEqual(truncated_sequence, truncated_second_sequence)
|
||||||
|
|
||||||
def test_encode_input_type(self):
|
def test_encode_input_type(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
@@ -273,5 +273,43 @@ class CommonTestCases:
|
|||||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||||
formatted_input = tokenizer.encode(sequence, add_special_tokens=True)
|
formatted_input = tokenizer.encode(sequence, add_special_tokens=True)
|
||||||
|
|
||||||
assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
|
self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
|
||||||
assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
|
self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
|
||||||
|
|
||||||
|
def test_special_tokens_mask(self):
|
||||||
|
tokenizer = self.get_tokenizer()
|
||||||
|
|
||||||
|
sequence_0 = "Encode this."
|
||||||
|
sequence_1 = "This one too please."
|
||||||
|
|
||||||
|
# Testing single inputs
|
||||||
|
encoded_sequence = tokenizer.encode(sequence_0)
|
||||||
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
||||||
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
|
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||||
|
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
|
||||||
|
|
||||||
|
filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
|
||||||
|
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
||||||
|
self.assertEqual(encoded_sequence, filtered_sequence)
|
||||||
|
|
||||||
|
# Testing inputs pairs
|
||||||
|
encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
|
||||||
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
|
||||||
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
|
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||||
|
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
|
||||||
|
|
||||||
|
filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
|
||||||
|
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
||||||
|
self.assertEqual(encoded_sequence, filtered_sequence)
|
||||||
|
|
||||||
|
# Testing with already existing special tokens
|
||||||
|
if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
|
||||||
|
tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
|
||||||
|
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
|
||||||
|
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||||
|
special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
|
||||||
|
special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
|
||||||
|
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
|
||||||
|
self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
|
||||||
|
|||||||
@@ -72,8 +72,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders")
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build")
|
||||||
|
|
||||||
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|
||||||
assert encoded_sentence == [1] + text + [1]
|
assert encoded_sentence == [1] + text + [1]
|
||||||
assert encoded_pair == [1] + text + [1] + text_2 + [1]
|
assert encoded_pair == [1] + text + [1] + text_2 + [1]
|
||||||
|
|||||||
@@ -95,8 +95,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
text = tokenizer.encode("sequence builders")
|
text = tokenizer.encode("sequence builders")
|
||||||
text_2 = tokenizer.encode("multi-sequence build")
|
text_2 = tokenizer.encode("multi-sequence build")
|
||||||
|
|
||||||
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
|
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
|
||||||
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
|
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
|
||||||
|
|
||||||
assert encoded_sentence == text + [4, 3]
|
assert encoded_sentence == text + [4, 3]
|
||||||
assert encoded_pair == text + [4] + text_2 + [4, 3]
|
assert encoded_pair == text + [4] + text_2 + [4, 3]
|
||||||
|
|||||||
@@ -187,33 +187,59 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = ' '.join(tokens).replace(' ##', '').strip()
|
out_string = ' '.join(tokens).replace(' ##', '').strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to the a sequence for sequence classification tasks.
|
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
|
||||||
A BERT sequence has the following format: [CLS] X [SEP]
|
by concatenating and adding special tokens.
|
||||||
|
A BERT sequence has the following format:
|
||||||
|
single sequence: [CLS] X [SEP]
|
||||||
|
pair of sequences: [CLS] A [SEP] B [SEP]
|
||||||
"""
|
"""
|
||||||
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
if token_ids_1 is None:
|
||||||
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
|
||||||
"""
|
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
|
||||||
A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
|
|
||||||
"""
|
|
||||||
sep = [self.sep_token_id]
|
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
sep = [self.sep_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
|
"""
|
||||||
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0: list of ids (must not contain special tokens)
|
||||||
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||||
|
for sequence pairs
|
||||||
|
already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
||||||
|
special tokens for the model
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if already_has_special_tokens:
|
||||||
|
if token_ids_1 is not None:
|
||||||
|
raise ValueError("You should not supply a second sequence if the provided sequence of "
|
||||||
|
"ids is already formated with special tokens for the model.")
|
||||||
|
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||||
|
|
||||||
|
if token_ids_1 is not None:
|
||||||
|
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||||
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
|
|
||||||
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A BERT sequence pair mask has the following format:
|
A BERT sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
| first sequence | second sequence
|
| first sequence | second sequence
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return len(cls + token_ids_0 + sep) * [0]
|
||||||
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, vocab_path):
|
def save_vocabulary(self, vocab_path):
|
||||||
|
|||||||
@@ -84,30 +84,57 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||||
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
|
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
|
||||||
A RoBERTa sequence has the following format: <s> X </s>
|
by concatenating and adding special tokens.
|
||||||
|
A RoBERTa sequence has the following format:
|
||||||
|
single sequence: <s> X </s>
|
||||||
|
pair of sequences: <s> A </s></s> B </s>
|
||||||
"""
|
"""
|
||||||
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
if token_ids_1 is None:
|
||||||
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
|
||||||
"""
|
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
|
||||||
A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
|
|
||||||
"""
|
|
||||||
sep = [self.sep_token_id]
|
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
sep = [self.sep_token_id]
|
||||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
|
"""
|
||||||
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0: list of ids (must not contain special tokens)
|
||||||
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||||
|
for sequence pairs
|
||||||
|
already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
||||||
|
special tokens for the model
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
|
"""
|
||||||
|
if already_has_special_tokens:
|
||||||
|
if token_ids_1 is not None:
|
||||||
|
raise ValueError("You should not supply a second sequence if the provided sequence of "
|
||||||
|
"ids is already formated with special tokens for the model.")
|
||||||
|
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||||
|
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
|
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
||||||
|
|
||||||
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A RoBERTa sequence pair mask has the following format:
|
A RoBERTa sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
| first sequence | second sequence
|
| first sequence | second sequence
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
|
if token_ids_1 is None:
|
||||||
|
return len(cls + token_ids_0 + sep) * [0]
|
||||||
|
return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|||||||
@@ -539,15 +539,9 @@ class PreTrainedTokenizer(object):
|
|||||||
Returns:
|
Returns:
|
||||||
Number of tokens added to sequences
|
Number of tokens added to sequences
|
||||||
"""
|
"""
|
||||||
|
token_ids_0 = []
|
||||||
if pair:
|
token_ids_1 = []
|
||||||
initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
|
return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
|
||||||
final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
|
|
||||||
else:
|
|
||||||
initial_tokens_len = len(self.encode("This is a sequence"))
|
|
||||||
final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
|
|
||||||
|
|
||||||
return final_tokens_len - initial_tokens_len
|
|
||||||
|
|
||||||
def add_special_tokens(self, special_tokens_dict):
|
def add_special_tokens(self, special_tokens_dict):
|
||||||
"""
|
"""
|
||||||
@@ -699,7 +693,7 @@ class PreTrainedTokenizer(object):
|
|||||||
add_special_tokens=False,
|
add_special_tokens=False,
|
||||||
max_length=None,
|
max_length=None,
|
||||||
stride=0,
|
stride=0,
|
||||||
truncate_first_sequence=True,
|
truncation_strategy='longest_first',
|
||||||
return_tensors=None,
|
return_tensors=None,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
"""
|
"""
|
||||||
@@ -719,9 +713,13 @@ class PreTrainedTokenizer(object):
|
|||||||
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
|
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
|
||||||
If there are overflowing tokens, those will be added to the returned dictionary
|
If there are overflowing tokens, those will be added to the returned dictionary
|
||||||
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
|
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
|
||||||
from the main sequence returned. The value of this argument defined the number of additional tokens.
|
from the main sequence returned. The value of this argument defines the number of additional tokens.
|
||||||
truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
|
truncation_strategy: string selected in the following options:
|
||||||
will be truncated.
|
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
|
||||||
|
starting from the longest one at each token (when there is a pair of input sequences)
|
||||||
|
- 'only_first': Only truncate the first sequence
|
||||||
|
- 'only_second': Only truncate the second sequence
|
||||||
|
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
|
||||||
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
|
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
|
||||||
or PyTorch torch.Tensor instead of a list of python integers.
|
or PyTorch torch.Tensor instead of a list of python integers.
|
||||||
**kwargs: passed to the `self.tokenize()` method
|
**kwargs: passed to the `self.tokenize()` method
|
||||||
@@ -731,7 +729,7 @@ class PreTrainedTokenizer(object):
|
|||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
add_special_tokens=add_special_tokens,
|
add_special_tokens=add_special_tokens,
|
||||||
stride=stride,
|
stride=stride,
|
||||||
truncate_first_sequence=truncate_first_sequence,
|
truncation_strategy=truncation_strategy,
|
||||||
return_tensors=return_tensors,
|
return_tensors=return_tensors,
|
||||||
**kwargs)
|
**kwargs)
|
||||||
|
|
||||||
@@ -743,7 +741,7 @@ class PreTrainedTokenizer(object):
|
|||||||
add_special_tokens=False,
|
add_special_tokens=False,
|
||||||
max_length=None,
|
max_length=None,
|
||||||
stride=0,
|
stride=0,
|
||||||
truncate_first_sequence=True,
|
truncation_strategy='longest_first',
|
||||||
return_tensors=None,
|
return_tensors=None,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
"""
|
"""
|
||||||
@@ -762,9 +760,13 @@ class PreTrainedTokenizer(object):
|
|||||||
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
|
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
|
||||||
If there are overflowing tokens, those will be added to the returned dictionary
|
If there are overflowing tokens, those will be added to the returned dictionary
|
||||||
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
|
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
|
||||||
from the main sequence returned. The value of this argument defined the number of additional tokens.
|
from the main sequence returned. The value of this argument defines the number of additional tokens.
|
||||||
truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
|
truncation_strategy: string selected in the following options:
|
||||||
will be truncated.
|
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
|
||||||
|
starting from the longest one at each token (when there is a pair of input sequences)
|
||||||
|
- 'only_first': Only truncate the first sequence
|
||||||
|
- 'only_second': Only truncate the second sequence
|
||||||
|
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
|
||||||
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
|
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
|
||||||
or PyTorch torch.Tensor instead of a list of python integers.
|
or PyTorch torch.Tensor instead of a list of python integers.
|
||||||
**kwargs: passed to the `self.tokenize()` method
|
**kwargs: passed to the `self.tokenize()` method
|
||||||
@@ -788,12 +790,11 @@ class PreTrainedTokenizer(object):
|
|||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
add_special_tokens=add_special_tokens,
|
add_special_tokens=add_special_tokens,
|
||||||
stride=stride,
|
stride=stride,
|
||||||
truncate_first_sequence=truncate_first_sequence,
|
truncation_strategy=truncation_strategy,
|
||||||
return_tensors=return_tensors)
|
return_tensors=return_tensors)
|
||||||
|
|
||||||
|
|
||||||
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
|
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
|
||||||
truncate_first_sequence=True, return_tensors=None):
|
truncation_strategy='longest_first', return_tensors=None):
|
||||||
"""
|
"""
|
||||||
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
|
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
|
||||||
It adds special tokens, truncates
|
It adds special tokens, truncates
|
||||||
@@ -810,41 +811,50 @@ class PreTrainedTokenizer(object):
|
|||||||
to their model.
|
to their model.
|
||||||
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
|
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
|
||||||
list of inputs.
|
list of inputs.
|
||||||
truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
|
truncation_strategy: string selected in the following options:
|
||||||
alongside a specified `max_length`, will truncate the first sequence if the total size is superior
|
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
|
||||||
than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
|
starting from the longest one at each token (when there is a pair of input sequences)
|
||||||
|
- 'only_first': Only truncate the first sequence
|
||||||
|
- 'only_second': Only truncate the second sequence
|
||||||
|
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
|
||||||
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
|
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
|
||||||
or PyTorch torch.Tensor instead of a list of python integers.
|
or PyTorch torch.Tensor instead of a list of python integers.
|
||||||
|
|
||||||
Return:
|
Return:
|
||||||
a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
|
A Dictionary of shape::
|
||||||
|
|
||||||
|
{
|
||||||
|
input_ids: list[int],
|
||||||
|
overflowing_tokens: list[int] if a ``max_length`` is specified, else None
|
||||||
|
special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True``
|
||||||
|
}
|
||||||
|
|
||||||
|
With the fields:
|
||||||
|
``input_ids``: list of tokens to be fed to a model
|
||||||
|
|
||||||
|
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
|
||||||
|
|
||||||
|
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 1 specifying special added
|
||||||
|
tokens and 0 specifying sequence tokens.
|
||||||
"""
|
"""
|
||||||
pair = bool(pair_ids is not None)
|
pair = bool(pair_ids is not None)
|
||||||
len_ids = len(ids)
|
len_ids = len(ids)
|
||||||
len_pair_ids = len(pair_ids) if pair else 0
|
len_pair_ids = len(pair_ids) if pair else 0
|
||||||
|
|
||||||
encoded_inputs = {}
|
encoded_inputs = {}
|
||||||
if max_length:
|
total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
|
||||||
n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
|
if max_length and total_len > max_length:
|
||||||
if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
|
ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
|
||||||
logger.warning(
|
num_tokens_to_remove=total_len-max_length,
|
||||||
"You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
|
truncation_strategy=truncation_strategy,
|
||||||
"This pair of sequences will not be truncated.")
|
stride=stride)
|
||||||
else:
|
encoded_inputs["overflowing_tokens"] = overflowing_tokens
|
||||||
if n_added_tokens + len_ids + len_pair_ids > max_length:
|
encoded_inputs["num_truncated_tokens"] = total_len - max_length
|
||||||
if truncate_first_sequence or not pair:
|
|
||||||
encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
|
|
||||||
ids = ids[:max_length - len_pair_ids - n_added_tokens]
|
|
||||||
elif not truncate_first_sequence and pair:
|
|
||||||
encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
|
|
||||||
pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
|
|
||||||
else:
|
|
||||||
logger.warning(
|
|
||||||
"Cannot truncate second sequence as it is not provided. No truncation.")
|
|
||||||
|
|
||||||
if add_special_tokens:
|
if add_special_tokens:
|
||||||
sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
|
sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
|
||||||
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
|
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
|
||||||
|
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
|
||||||
else:
|
else:
|
||||||
sequence = ids + pair_ids if pair else ids
|
sequence = ids + pair_ids if pair else ids
|
||||||
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
|
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
|
||||||
@@ -861,20 +871,89 @@ class PreTrainedTokenizer(object):
|
|||||||
encoded_inputs["input_ids"] = sequence
|
encoded_inputs["input_ids"] = sequence
|
||||||
encoded_inputs["token_type_ids"] = token_type_ids
|
encoded_inputs["token_type_ids"] = token_type_ids
|
||||||
|
|
||||||
|
if max_length and len(encoded_inputs["input_ids"]) > max_length:
|
||||||
|
encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
|
||||||
|
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
|
||||||
|
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
|
||||||
|
|
||||||
return encoded_inputs
|
return encoded_inputs
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
|
||||||
|
"""Truncates a sequence pair in place to the maximum length.
|
||||||
|
truncation_strategy: string selected in the following options:
|
||||||
|
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
|
||||||
|
starting from the longest one at each token (when there is a pair of input sequences).
|
||||||
|
Overflowing tokens only contains overflow from the first sequence.
|
||||||
|
- 'only_first': Only truncate the first sequence. Raises an error if the first sequence is shorter than or equal to num_tokens_to_remove.
|
||||||
|
- 'only_second': Only truncate the second sequence
|
||||||
|
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
|
||||||
|
"""
|
||||||
|
if num_tokens_to_remove <= 0:
|
||||||
|
return ids, pair_ids, []
|
||||||
|
|
||||||
|
if truncation_strategy == 'longest_first':
|
||||||
|
overflowing_tokens = []
|
||||||
|
for _ in range(num_tokens_to_remove):
|
||||||
|
if pair_ids is None or len(ids) > len(pair_ids):
|
||||||
|
overflowing_tokens = [ids[-1]] + overflowing_tokens
|
||||||
|
ids = ids[:-1]
|
||||||
|
else:
|
||||||
|
pair_ids = pair_ids[:-1]
|
||||||
|
window_len = min(len(ids), stride)
|
||||||
|
if window_len > 0:
|
||||||
|
overflowing_tokens = ids[-window_len:] + overflowing_tokens
|
||||||
|
elif truncation_strategy == 'only_first':
|
||||||
|
assert len(ids) > num_tokens_to_remove
|
||||||
|
window_len = min(len(ids), stride + num_tokens_to_remove)
|
||||||
|
overflowing_tokens = ids[-window_len:]
|
||||||
|
ids = ids[:-num_tokens_to_remove]
|
||||||
|
elif truncation_strategy == 'only_second':
|
||||||
|
assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
|
||||||
|
window_len = min(len(pair_ids), stride + num_tokens_to_remove)
|
||||||
|
overflowing_tokens = pair_ids[-window_len:]
|
||||||
|
pair_ids = pair_ids[:-num_tokens_to_remove]
|
||||||
|
elif truncation_strategy == 'do_not_truncate':
|
||||||
|
raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")
|
||||||
|
else:
|
||||||
|
raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']")
|
||||||
|
return (ids, pair_ids, overflowing_tokens)
|
||||||
|
|
||||||
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
logger.warning("This tokenizer does not make use of special tokens.")
|
logger.warning("This tokenizer does not make use of special tokens.")
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return len(token_ids_0) * [0]
|
||||||
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
|
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
|
"""
|
||||||
return token_ids
|
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
|
||||||
|
by concatenating and adding special tokens.
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
This default implementation adds no special tokens; as an example, a RoBERTa sequence has the following format:
|
||||||
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
|
single sequence: <s> X </s>
|
||||||
|
pair of sequences: <s> A </s></s> B </s>
|
||||||
|
"""
|
||||||
|
logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return token_ids_0
|
||||||
return token_ids_0 + token_ids_1
|
return token_ids_0 + token_ids_1
|
||||||
|
|
||||||
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
|
"""
|
||||||
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0: list of ids (must not contain special tokens)
|
||||||
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||||
|
for sequence pairs
|
||||||
|
already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
||||||
|
special tokens for the model
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
|
"""
|
||||||
|
return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
|
||||||
|
|
||||||
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
||||||
""" Converts a single index or a sequence of indices (integers) in a token "
|
""" Converts a single index or a sequence of indices (integers) in a token "
|
||||||
(resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
|
(resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
|
||||||
|
|||||||
@@ -754,32 +754,59 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = ''.join(tokens).replace('</w>', ' ').strip()
|
out_string = ''.join(tokens).replace('</w>', ' ').strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
An XLM sequence has the following format: [CLS] X [SEP]
|
by concatenating and adding special tokens.
|
||||||
"""
|
An XLM sequence has the following format:
|
||||||
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
single sequence: [CLS] X [SEP]
|
||||||
|
pair of sequences: [CLS] A [SEP] B [SEP]
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
|
||||||
"""
|
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
|
||||||
An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
|
|
||||||
"""
|
"""
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
|
"""
|
||||||
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0: list of ids (must not contain special tokens)
|
||||||
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||||
|
for sequence pairs
|
||||||
|
already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
||||||
|
special tokens for the model
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if already_has_special_tokens:
|
||||||
|
if token_ids_1 is not None:
|
||||||
|
raise ValueError("You should not supply a second sequence if the provided sequence of "
|
||||||
|
"ids is already formated with special tokens for the model.")
|
||||||
|
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||||
|
|
||||||
|
if token_ids_1 is not None:
|
||||||
|
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||||
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
|
|
||||||
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
An XLM sequence pair mask has the following format:
|
An XLM sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
||||||
| first sequence | second sequence
|
| first sequence | second sequence
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return len(cls + token_ids_0 + sep) * [0]
|
||||||
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
|
|||||||
@@ -181,36 +181,61 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
|
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def add_special_tokens_single_sequence(self, token_ids):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Adds special tokens to a sequence for sequence classification tasks.
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
An XLNet sequence has the following format: X [SEP][CLS]
|
by concatenating and adding special tokens.
|
||||||
|
An XLNet sequence has the following format:
|
||||||
|
single sequence: X [SEP][CLS]
|
||||||
|
pair of sequences: A [SEP] B [SEP][CLS]
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return token_ids + sep + cls
|
if token_ids_1 is None:
|
||||||
|
return token_ids_0 + sep + cls
|
||||||
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
|
|
||||||
"""
|
|
||||||
Adds special tokens to a sequence pair for sequence classification tasks.
|
|
||||||
An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
|
|
||||||
"""
|
|
||||||
|
|
||||||
sep = [self.sep_token_id]
|
|
||||||
cls = [self.cls_token_id]
|
|
||||||
return token_ids_0 + sep + token_ids_1 + sep + cls
|
return token_ids_0 + sep + token_ids_1 + sep + cls
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
||||||
|
"""
|
||||||
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0: list of ids (must not contain special tokens)
|
||||||
|
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
||||||
|
for sequence pairs
|
||||||
|
already_has_special_tokens: (default False) Set to True if the token list is already formatted with
|
||||||
|
special tokens for the model
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if already_has_special_tokens:
|
||||||
|
if token_ids_1 is not None:
|
||||||
|
raise ValueError("You should not supply a second sequence if the provided sequence of "
|
||||||
|
"ids is already formated with special tokens for the model.")
|
||||||
|
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||||
|
|
||||||
|
if token_ids_1 is not None:
|
||||||
|
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
|
||||||
|
return ([0] * len(token_ids_0)) + [1, 1]
|
||||||
|
|
||||||
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
An XLNet sequence pair mask has the following format:
|
An XLNet sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
|
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
|
||||||
| first sequence | second sequence | CLS segment ID
|
| first sequence | second sequence | CLS segment ID
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
cls_segment_id = [2]
|
cls_segment_id = [2]
|
||||||
|
|
||||||
|
if token_ids_1 is None:
|
||||||
|
return len(token_ids_0 + sep + cls) * [0]
|
||||||
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
|
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
|
|||||||
Reference in New Issue
Block a user