resolve PR comments
This commit is contained in:
@@ -16,10 +16,9 @@
|
||||
""" Finetuning seq2seq models for sequence generation."""
|
||||
|
||||
import argparse
|
||||
from collections import deque
|
||||
import functools
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
import sys
|
||||
|
||||
@@ -29,7 +28,22 @@ import torch
|
||||
from torch.optim import Adam
|
||||
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
|
||||
|
||||
from transformers import AutoTokenizer, PreTrainedSeq2seq, Model2Model
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
BertForMaskedLM,
|
||||
BertConfig,
|
||||
PreTrainedSeq2seq,
|
||||
Model2Model,
|
||||
)
|
||||
|
||||
from utils_summarization import (
|
||||
CNNDailyMailDataset,
|
||||
encode_for_summarization,
|
||||
fit_to_block_size,
|
||||
build_lm_labels,
|
||||
build_mask,
|
||||
compute_token_type_ids,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
@@ -46,194 +60,41 @@ def set_seed(args):
|
||||
# ------------
|
||||
|
||||
|
||||
class TextDataset(Dataset):
|
||||
""" Abstracts the dataset used to train seq2seq models.
|
||||
|
||||
CNN/Daily News:
|
||||
|
||||
The CNN/Daily News raw datasets are downloaded from [1]. The stories are
|
||||
stored in different files; the summary appears at the end of the story as
|
||||
sentences that are prefixed by the special `@highlight` line. To process
|
||||
the data, untar both datasets in the same folder, and pass the path to this
|
||||
folder as the "data_dir argument. The formatting code was inspired by [2].
|
||||
|
||||
[1] https://cs.nyu.edu/~kcho/
|
||||
[2] https://github.com/abisee/cnn-dailymail/
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer, prefix="train", data_dir="", block_size=512):
|
||||
assert os.path.isdir(data_dir)
|
||||
|
||||
# Load the features that have already been computed, if any
|
||||
cached_features_file = os.path.join(
|
||||
data_dir, "cached_lm_{}_{}".format(block_size, prefix)
|
||||
)
|
||||
if os.path.exists(cached_features_file):
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
with open(cached_features_file, "rb") as source:
|
||||
self.examples = pickle.load(source)
|
||||
return
|
||||
|
||||
logger.info("Creating features from dataset at %s", data_dir)
|
||||
datasets = ["cnn", "dailymail"]
|
||||
|
||||
self.examples = {"source": [], "target": []}
|
||||
for dataset in datasets:
|
||||
path_to_stories = os.path.join(data_dir, dataset, "stories")
|
||||
story_filenames_list = os.listdir(path_to_stories)
|
||||
for story_filename in story_filenames_list:
|
||||
path_to_story = os.path.join(path_to_stories, story_filename)
|
||||
if not os.path.isfile(path_to_story):
|
||||
continue
|
||||
|
||||
with open(path_to_story, encoding="utf-8") as source:
|
||||
raw_story = source.read()
|
||||
story_lines, summary_lines = process_story(raw_story)
|
||||
if len(summary_lines) == 0 or len(story_lines) == 0:
|
||||
continue
|
||||
|
||||
story_token_ids, summary_token_ids = _encode_for_summarization(
|
||||
story_lines, summary_lines, tokenizer
|
||||
)
|
||||
story_seq = _fit_to_block_size(story_token_ids, block_size)
|
||||
self.examples["source"].append(story_seq)
|
||||
|
||||
summary_seq = _fit_to_block_size(summary_token_ids, block_size)
|
||||
self.examples["summary"].append(summary_seq)
|
||||
|
||||
logger.info("Saving features into cache file %s", cached_features_file)
|
||||
with open(cached_features_file, "wb") as sink:
|
||||
pickle.dump(self.examples, sink, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.examples)
|
||||
|
||||
def __getitem__(self, items):
|
||||
return (
|
||||
torch.tensor(self.examples["source"][items]),
|
||||
torch.tensor(self.examples["target"][items]),
|
||||
)
|
||||
|
||||
|
||||
def process_story(raw_story):
|
||||
""" Extract the story and summary from a story file.
|
||||
|
||||
Attributes:
|
||||
raw_story (str): content of the story file as an utf-8 encoded string.
|
||||
|
||||
Raises:
|
||||
IndexError: If the stoy is empty or contains no highlights.
|
||||
"""
|
||||
nonempty_lines = list(
|
||||
filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
|
||||
)
|
||||
|
||||
# for some unknown reason some lines miss a period, add it
|
||||
nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
|
||||
|
||||
# gather article lines
|
||||
story_lines = []
|
||||
lines = deque(nonempty_lines)
|
||||
while True:
|
||||
try:
|
||||
element = lines.popleft()
|
||||
if element.startswith("@highlight"):
|
||||
break
|
||||
story_lines.append(element)
|
||||
except IndexError:
|
||||
# if "@highlight" is absent from the file we pop
|
||||
# all elements until there is None.
|
||||
return story_lines, []
|
||||
|
||||
# gather summary lines
|
||||
summary_lines = list(filter(lambda t: not t.startswith("@highlight"), lines))
|
||||
|
||||
return story_lines, summary_lines
|
||||
|
||||
|
||||
def _encode_for_summarization(story_lines, summary_lines, tokenizer):
|
||||
""" Encode the story and summary lines, and join them
|
||||
as specified in [1] by using `[SEP] [CLS]` tokens to separate
|
||||
sentences.
|
||||
"""
|
||||
story_lines_token_ids = [
|
||||
tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
|
||||
for line in story_lines
|
||||
]
|
||||
summary_lines_token_ids = [
|
||||
tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
|
||||
for line in summary_lines
|
||||
]
|
||||
|
||||
story_token_ids = [
|
||||
token for sentence in story_lines_token_ids for token in sentence
|
||||
]
|
||||
summary_token_ids = [
|
||||
token for sentence in summary_lines_token_ids for token in sentence
|
||||
]
|
||||
|
||||
return story_token_ids, summary_token_ids
|
||||
|
||||
|
||||
def _add_missing_period(line):
|
||||
END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
|
||||
if line.startswith("@highlight"):
|
||||
return line
|
||||
if line[-1] in END_TOKENS:
|
||||
return line
|
||||
return line + "."
|
||||
|
||||
|
||||
def _fit_to_block_size(sequence, block_size):
|
||||
""" Adapt the source and target sequences' lengths to the block size.
|
||||
If the sequence is shorter than the block size we pad it with -1 ids
|
||||
which correspond to padding tokens.
|
||||
"""
|
||||
if len(sequence) > block_size:
|
||||
return sequence[:block_size]
|
||||
else:
|
||||
sequence.extend([0] * (block_size - len(sequence)))
|
||||
return sequence
|
||||
|
||||
|
||||
def mask_padding_tokens(sequence):
|
||||
""" Padding token, encoded as 0, are represented by the value -1 in the
|
||||
masks """
|
||||
padded = sequence.clone()
|
||||
padded[padded == 0] = -1
|
||||
return padded
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer):
|
||||
dataset = TextDataset(tokenizer, data_dir=args.data_dir)
|
||||
dataset = CNNDailyMailDataset(tokenizer, data_dir=args.data_dir)
|
||||
return dataset
|
||||
|
||||
|
||||
def compute_token_type_ids(batch, separator_token_id):
|
||||
""" Segment embeddings as described in [1]
|
||||
def collate(data, tokenizer, block_size):
|
||||
""" List of tuple as an input. """
|
||||
# remove the files with empty an story/summary, encode and fit to block
|
||||
data = filter(lambda x: not (len(x[0]) == 0 or len(x[1]) == 0), data)
|
||||
data = [
|
||||
encode_for_summarization(story, summary, tokenizer) for story, summary in data
|
||||
]
|
||||
data = [
|
||||
(
|
||||
fit_to_block_size(story, block_size, tokenizer.pad_token_id),
|
||||
fit_to_block_size(summary, block_size, tokenizer.pad_token_id),
|
||||
)
|
||||
for story, summary in data
|
||||
]
|
||||
|
||||
The values {0,1} were found in the repository [2].
|
||||
stories = torch.tensor([story for story, summary in data])
|
||||
summaries = torch.tensor([summary for story, summary in data])
|
||||
encoder_token_type_ids = compute_token_type_ids(stories, tokenizer.cls_token_id)
|
||||
encoder_mask = build_mask(stories, tokenizer.pad_token_id)
|
||||
decoder_mask = build_mask(summaries, tokenizer.pad_token_id)
|
||||
lm_labels = build_lm_labels(summaries, tokenizer.pad_token_id)
|
||||
|
||||
Attributes:
|
||||
batch: torch.Tensor, size [batch_size, block_size]
|
||||
Batch of input.
|
||||
separator_token_id: int
|
||||
The value of the token that separates the segments.
|
||||
|
||||
[1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
|
||||
arXiv preprint arXiv:1908.08345 (2019).
|
||||
[2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217)
|
||||
"""
|
||||
batch_embeddings = []
|
||||
sentence_num = 0
|
||||
for sequence in batch:
|
||||
embeddings = []
|
||||
for s in sequence:
|
||||
if s == separator_token_id:
|
||||
sentence_num += 1
|
||||
embeddings.append(sentence_num % 2)
|
||||
batch_embeddings.append(embeddings)
|
||||
return torch.tensor(batch_embeddings)
|
||||
return (
|
||||
stories,
|
||||
summaries,
|
||||
encoder_token_type_ids,
|
||||
encoder_mask,
|
||||
decoder_mask,
|
||||
lm_labels,
|
||||
)
|
||||
|
||||
|
||||
# ----------
|
||||
@@ -252,7 +113,7 @@ class BertSumOptimizer(object):
|
||||
arXiv preprint arXiv:1908.08345 (2019).
|
||||
"""
|
||||
|
||||
def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-9):
|
||||
def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-8):
|
||||
self.encoder = model.encoder
|
||||
self.decoder = model.decoder
|
||||
self.lr = lr
|
||||
@@ -306,8 +167,12 @@ def train(args, model, tokenizer):
|
||||
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
|
||||
train_dataset = load_and_cache_examples(args, tokenizer)
|
||||
train_sampler = RandomSampler(train_dataset)
|
||||
model_collate_fn = functools.partial(collate, tokenizer=tokenizer, block_size=512)
|
||||
train_dataloader = DataLoader(
|
||||
train_dataset, sampler=train_sampler, batch_size=args.train_batch_size
|
||||
train_dataset,
|
||||
sampler=train_sampler,
|
||||
batch_size=args.train_batch_size,
|
||||
collate_fn=model_collate_fn,
|
||||
)
|
||||
|
||||
# Training schedule
|
||||
@@ -351,26 +216,23 @@ def train(args, model, tokenizer):
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
source, target = batch
|
||||
token_type_ids = compute_token_type_ids(source, tokenizer.cls_token_id)
|
||||
labels_src = mask_padding_tokens(source)
|
||||
labels_tgt = mask_padding_tokens(target)
|
||||
source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
|
||||
|
||||
source = source.to(args.device)
|
||||
target = target.to(args.device)
|
||||
token_type_ids = token_type_ids.to(args.device)
|
||||
labels_src = labels_src.to(args.device)
|
||||
labels_tgt = labels_tgt.to(args.device)
|
||||
encoder_token_type_ids = encoder_token_type_ids.to(args.device)
|
||||
encoder_mask = encoder_mask.to(args.device)
|
||||
decoder_mask = decoder_mask.to(args.device)
|
||||
lm_labels = lm_labels.to(args.device)
|
||||
|
||||
model.train()
|
||||
outputs = model(
|
||||
source,
|
||||
target,
|
||||
token_type_ids=token_type_ids,
|
||||
decoder_encoder_attention_mask=labels_src,
|
||||
decoder_attention_mask=labels_tgt,
|
||||
decoder_lm_labels=labels_tgt,
|
||||
decoder_initialize_randomly=True,
|
||||
encoder_token_type_ids=encoder_token_type_ids,
|
||||
encoder_attention_mask=encoder_mask,
|
||||
decoder_attention_mask=decoder_mask,
|
||||
decoder_lm_labels=lm_labels,
|
||||
)
|
||||
|
||||
loss = outputs[0]
|
||||
@@ -421,21 +283,23 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
model.eval()
|
||||
|
||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
||||
source, target = batch
|
||||
labels_src = mask_padding_tokens(source)
|
||||
labels_tgt = mask_padding_tokens(target)
|
||||
source.to(args.device)
|
||||
target.to(args.device)
|
||||
labels_src.to(args.device)
|
||||
labels_tgt.to(args.device)
|
||||
source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
|
||||
|
||||
source = source.to(args.device)
|
||||
target = target.to(args.device)
|
||||
encoder_token_type_ids = encoder_token_type_ids.to(args.device)
|
||||
encoder_mask = encoder_mask.to(args.device)
|
||||
decoder_mask = decoder_mask.to(args.device)
|
||||
lm_labels = lm_labels.to(args.device)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(
|
||||
source,
|
||||
target,
|
||||
decoder_encoder_attention_mask=labels_src,
|
||||
decoder_attention_mask=labels_tgt,
|
||||
decoder_lm_labels=labels_tgt,
|
||||
encoder_token_type_ids=encoder_token_type_ids,
|
||||
encoder_attention_mask=encoder_mask,
|
||||
decoder_attention_mask=decoder_mask,
|
||||
decoder_lm_labels=lm_labels,
|
||||
)
|
||||
lm_loss = outputs[0]
|
||||
eval_loss += lm_loss.mean().item()
|
||||
@@ -525,7 +389,7 @@ def main():
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_train_epochs",
|
||||
default=1,
|
||||
default=10,
|
||||
type=int,
|
||||
help="Total number of training epochs to perform.",
|
||||
)
|
||||
@@ -558,9 +422,13 @@ def main():
|
||||
args.device = torch.device("cuda")
|
||||
args.n_gpu = torch.cuda.device_count()
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
# Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
|
||||
model = Model2Model.from_pretrained(args.model_name_or_path)
|
||||
config = BertConfig.from_pretrained(args.model_name_or_path)
|
||||
decoder_model = BertForMaskedLM(config)
|
||||
model = Model2Model.from_pretrained(
|
||||
args.model_name_or_path, decoder_model=decoder_model
|
||||
)
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import unittest
|
||||
|
||||
from run_summarization_finetuning import _fit_to_block_size, process_story
|
||||
|
||||
|
||||
class DataLoaderTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.block_size = 10
|
||||
|
||||
def test_truncate_sequence_too_small(self):
|
||||
""" Pad the sequence with 0 if the sequence is smaller than the block size."""
|
||||
sequence = [1, 2, 3, 4]
|
||||
expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
|
||||
self.assertEqual(_fit_to_block_size(sequence, self.block_size), expected_output)
|
||||
|
||||
def test_truncate_sequence_fit_exactly(self):
|
||||
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
||||
expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
||||
self.assertEqual(_fit_to_block_size(sequence, self.block_size), expected_output)
|
||||
|
||||
def test_truncate_sequence_too_big(self):
|
||||
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
|
||||
expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
||||
self.assertEqual(_fit_to_block_size(sequence, self.block_size), expected_output)
|
||||
|
||||
def test_process_story_no_highlights(self):
|
||||
""" Processing a story with no highlights should raise an exception.
|
||||
"""
|
||||
raw_story = """It was the year of Our Lord one thousand seven hundred and
|
||||
seventy-five.\n\nSpiritual revelations were conceded to England at that
|
||||
favoured period, as at this."""
|
||||
_, summary = process_story(raw_story)
|
||||
self.assertEqual(summary, [])
|
||||
|
||||
def test_process_empty_story(self):
|
||||
""" An empty story should also raise and exception.
|
||||
"""
|
||||
raw_story = ""
|
||||
story, summary = process_story(raw_story)
|
||||
self.assertEqual(story, [])
|
||||
self.assertEqual(summary, [])
|
||||
|
||||
def test_story_with_missing_period(self):
|
||||
raw_story = (
|
||||
"It was the year of Our Lord one thousand seven hundred and "
|
||||
"seventy-five\n\nSpiritual revelations were conceded to England "
|
||||
"at that favoured period, as at this.\n@highlight\n\nIt was the best of times"
|
||||
)
|
||||
story_lines, summary_lines = process_story(raw_story)
|
||||
|
||||
expected_story_lines = [
|
||||
"It was the year of Our Lord one thousand seven hundred and seventy-five.",
|
||||
"Spiritual revelations were conceded to England at that favoured period, as at this.",
|
||||
]
|
||||
self.assertEqual(expected_story_lines, story_lines)
|
||||
|
||||
expected_summary_lines = ["It was the best of times."]
|
||||
self.assertEqual(expected_summary_lines, summary_lines)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
184
examples/utils_summarization.py
Normal file
184
examples/utils_summarization.py
Normal file
@@ -0,0 +1,184 @@
|
||||
from collections import deque
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
|
||||
# ------------
|
||||
# Data loading
|
||||
# ------------
|
||||
|
||||
|
||||
class CNNDailyMailDataset(Dataset):
|
||||
""" Abstracts the dataset used to train seq2seq models.
|
||||
|
||||
CNN/Daily News:
|
||||
|
||||
The CNN/Daily News raw datasets are downloaded from [1]. The stories are
|
||||
stored in different files; the summary appears at the end of the story as
|
||||
sentences that are prefixed by the special `@highlight` line. To process
|
||||
the data, untar both datasets in the same folder, and pass the path to this
|
||||
folder as the "data_dir argument. The formatting code was inspired by [2].
|
||||
|
||||
[1] https://cs.nyu.edu/~kcho/
|
||||
[2] https://github.com/abisee/cnn-dailymail/
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer, prefix="train", data_dir=""):
|
||||
assert os.path.isdir(data_dir)
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
# We initialize the class by listing all the files that contain
|
||||
# stories and summaries. Files are not read in memory given
|
||||
# the size of the corpus.
|
||||
self.stories_path = []
|
||||
datasets = ("cnn", "dailymail")
|
||||
for dataset in datasets:
|
||||
path_to_stories = os.path.join(data_dir, dataset, "stories")
|
||||
story_filenames_list = os.listdir(path_to_stories)
|
||||
for story_filename in story_filenames_list:
|
||||
path_to_story = os.path.join(path_to_stories, story_filename)
|
||||
if not os.path.isfile(path_to_story):
|
||||
continue
|
||||
self.stories_path.append(path_to_story)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.stories_path)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
story_path = self.stories_path[idx]
|
||||
with open(story_path, encoding="utf-8") as source:
|
||||
raw_story = source.read()
|
||||
story_lines, summary_lines = process_story(raw_story)
|
||||
return story_lines, summary_lines
|
||||
|
||||
|
||||
def process_story(raw_story):
|
||||
""" Extract the story and summary from a story file.
|
||||
|
||||
Attributes:
|
||||
raw_story (str): content of the story file as an utf-8 encoded string.
|
||||
|
||||
Raises:
|
||||
IndexError: If the stoy is empty or contains no highlights.
|
||||
"""
|
||||
nonempty_lines = list(
|
||||
filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
|
||||
)
|
||||
|
||||
# for some unknown reason some lines miss a period, add it
|
||||
nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
|
||||
|
||||
# gather article lines
|
||||
story_lines = []
|
||||
lines = deque(nonempty_lines)
|
||||
while True:
|
||||
try:
|
||||
element = lines.popleft()
|
||||
if element.startswith("@highlight"):
|
||||
break
|
||||
story_lines.append(element)
|
||||
except IndexError:
|
||||
# if "@highlight" is absent from the file we pop
|
||||
# all elements until there is None.
|
||||
return story_lines, []
|
||||
|
||||
# gather summary lines
|
||||
summary_lines = list(filter(lambda t: not t.startswith("@highlight"), lines))
|
||||
|
||||
return story_lines, summary_lines
|
||||
|
||||
|
||||
def _add_missing_period(line):
|
||||
END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
|
||||
if line.startswith("@highlight"):
|
||||
return line
|
||||
if line[-1] in END_TOKENS:
|
||||
return line
|
||||
return line + "."
|
||||
|
||||
|
||||
# --------------------------
|
||||
# Encoding and preprocessing
|
||||
# --------------------------
|
||||
|
||||
|
||||
def fit_to_block_size(sequence, block_size, pad_token):
|
||||
""" Adapt the source and target sequences' lengths to the block size.
|
||||
If the sequence is shorter than the block size we pad it with -1 ids
|
||||
which correspond to padding tokens.
|
||||
"""
|
||||
if len(sequence) > block_size:
|
||||
return sequence[:block_size]
|
||||
else:
|
||||
sequence.extend([pad_token] * (block_size - len(sequence)))
|
||||
return sequence
|
||||
|
||||
|
||||
def build_lm_labels(sequence, pad_token):
|
||||
""" Padding token, encoded as 0, are represented by the value -1 so they
|
||||
are not taken into account in the loss computation. """
|
||||
padded = sequence.clone()
|
||||
padded[padded == pad_token] = -1
|
||||
return padded
|
||||
|
||||
|
||||
def build_mask(sequence, pad_token):
|
||||
""" Builds the mask. The attention mechanism will only attend to positions
|
||||
with value 1. """
|
||||
mask = sequence.clone()
|
||||
mask[mask != pad_token] = 1
|
||||
mask[mask == pad_token] = 0
|
||||
return mask
|
||||
|
||||
|
||||
def encode_for_summarization(story_lines, summary_lines, tokenizer):
|
||||
""" Encode the story and summary lines, and join them
|
||||
as specified in [1] by using `[SEP] [CLS]` tokens to separate
|
||||
sentences.
|
||||
"""
|
||||
story_lines_token_ids = [
|
||||
tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
|
||||
for line in story_lines
|
||||
]
|
||||
summary_lines_token_ids = [
|
||||
tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
|
||||
for line in summary_lines
|
||||
]
|
||||
|
||||
story_token_ids = [
|
||||
token for sentence in story_lines_token_ids for token in sentence
|
||||
]
|
||||
summary_token_ids = [
|
||||
token for sentence in summary_lines_token_ids for token in sentence
|
||||
]
|
||||
|
||||
return story_token_ids, summary_token_ids
|
||||
|
||||
|
||||
def compute_token_type_ids(batch, separator_token_id):
|
||||
""" Segment embeddings as described in [1]
|
||||
|
||||
The values {0,1} were found in the repository [2].
|
||||
|
||||
Attributes:
|
||||
batch: torch.Tensor, size [batch_size, block_size]
|
||||
Batch of input.
|
||||
separator_token_id: int
|
||||
The value of the token that separates the segments.
|
||||
|
||||
[1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
|
||||
arXiv preprint arXiv:1908.08345 (2019).
|
||||
[2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217)
|
||||
"""
|
||||
batch_embeddings = []
|
||||
for sequence in batch:
|
||||
sentence_num = 0
|
||||
embeddings = []
|
||||
for s in sequence:
|
||||
if s == separator_token_id:
|
||||
sentence_num += 1
|
||||
embeddings.append(sentence_num % 2)
|
||||
batch_embeddings.append(embeddings)
|
||||
return torch.tensor(batch_embeddings)
|
||||
133
examples/utils_summarization_test.py
Normal file
133
examples/utils_summarization_test.py
Normal file
@@ -0,0 +1,133 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2019 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from utils_summarization import (
|
||||
compute_token_type_ids,
|
||||
fit_to_block_size,
|
||||
build_mask,
|
||||
build_lm_labels,
|
||||
process_story,
|
||||
)
|
||||
|
||||
|
||||
class SummarizationDataProcessingTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.block_size = 10
|
||||
|
||||
def test_fit_to_block_sequence_too_small(self):
|
||||
""" Pad the sequence with 0 if the sequence is smaller than the block size."""
|
||||
sequence = [1, 2, 3, 4]
|
||||
expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
|
||||
self.assertEqual(
|
||||
fit_to_block_size(sequence, self.block_size, 0), expected_output
|
||||
)
|
||||
|
||||
def test_fit_to_block_sequence_fit_exactly(self):
|
||||
""" Do nothing if the sequence is the right size. """
|
||||
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
||||
expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
||||
self.assertEqual(
|
||||
fit_to_block_size(sequence, self.block_size, 0), expected_output
|
||||
)
|
||||
|
||||
def test_fit_to_block_sequence_too_big(self):
|
||||
""" Truncate the sequence if it is too long. """
|
||||
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
|
||||
expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
||||
self.assertEqual(
|
||||
fit_to_block_size(sequence, self.block_size, 0), expected_output
|
||||
)
|
||||
|
||||
def test_process_story_no_highlights(self):
|
||||
""" Processing a story with no highlights returns an empty list for the summary.
|
||||
"""
|
||||
raw_story = """It was the year of Our Lord one thousand seven hundred and
|
||||
seventy-five.\n\nSpiritual revelations were conceded to England at that
|
||||
favoured period, as at this."""
|
||||
_, summary_lines = process_story(raw_story)
|
||||
self.assertEqual(summary_lines, [])
|
||||
|
||||
def test_process_empty_story(self):
|
||||
""" An empty story returns an empty collection of lines.
|
||||
"""
|
||||
raw_story = ""
|
||||
story_lines, summary_lines = process_story(raw_story)
|
||||
self.assertEqual(story_lines, [])
|
||||
self.assertEqual(summary_lines, [])
|
||||
|
||||
def test_process_story_with_missing_period(self):
|
||||
raw_story = (
|
||||
"It was the year of Our Lord one thousand seven hundred and "
|
||||
"seventy-five\n\nSpiritual revelations were conceded to England "
|
||||
"at that favoured period, as at this.\n@highlight\n\nIt was the best of times"
|
||||
)
|
||||
story_lines, summary_lines = process_story(raw_story)
|
||||
|
||||
expected_story_lines = [
|
||||
"It was the year of Our Lord one thousand seven hundred and seventy-five.",
|
||||
"Spiritual revelations were conceded to England at that favoured period, as at this.",
|
||||
]
|
||||
self.assertEqual(expected_story_lines, story_lines)
|
||||
|
||||
expected_summary_lines = ["It was the best of times."]
|
||||
self.assertEqual(expected_summary_lines, summary_lines)
|
||||
|
||||
def test_build_lm_labels_no_padding(self):
|
||||
sequence = torch.tensor([1, 2, 3, 4])
|
||||
expected = sequence
|
||||
np.testing.assert_array_equal(
|
||||
build_lm_labels(sequence, 0).numpy(), expected.numpy()
|
||||
)
|
||||
|
||||
def test_build_lm_labels(self):
|
||||
sequence = torch.tensor([1, 2, 3, 4, 0, 0, 0])
|
||||
expected = torch.tensor([1, 2, 3, 4, -1, -1, -1])
|
||||
np.testing.assert_array_equal(
|
||||
build_lm_labels(sequence, 0).numpy(), expected.numpy()
|
||||
)
|
||||
|
||||
def test_build_mask_no_padding(self):
|
||||
sequence = torch.tensor([1, 2, 3, 4])
|
||||
expected = torch.tensor([1, 1, 1, 1])
|
||||
np.testing.assert_array_equal(
|
||||
build_mask(sequence, 0).numpy(), expected.numpy()
|
||||
)
|
||||
|
||||
def test_build_mask(self):
|
||||
sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23])
|
||||
expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
|
||||
np.testing.assert_array_equal(
|
||||
build_mask(sequence, 23).numpy(), expected.numpy()
|
||||
)
|
||||
|
||||
def test_compute_token_type_ids(self):
|
||||
separator = 101
|
||||
batch = torch.tensor(
|
||||
[[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
|
||||
)
|
||||
expected = torch.tensor(
|
||||
[[0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1], [0, 1, 1, 1, 0, 0]]
|
||||
)
|
||||
|
||||
result = compute_token_type_ids(batch, separator)
|
||||
np.testing.assert_array_equal(result, expected)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user