# Add whole word mask support for lm fine-tune (#7925)
* ADD: add whole word mask proxy for both eng and chinese * MOD: adjust format * MOD: reformat code * MOD: update import * MOD: fix bug * MOD: add import * MOD: fix bug * MOD: decouple code and update readme * MOD: reformat code * Update examples/language-modeling/README.md Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update examples/language-modeling/README.md Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update examples/language-modeling/run_language_modeling.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update examples/language-modeling/run_language_modeling.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update examples/language-modeling/run_language_modeling.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update examples/language-modeling/run_language_modeling.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * change wwm to whole_word_mask * reformat code * reformat * format * Code quality * ADD: update chinese ref readme * MOD: small changes * MOD: small changes2 * update readme Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import random
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
|
||||
|
||||
@@ -195,6 +196,124 @@ class DataCollatorForLanguageModeling:
|
||||
return inputs, labels
|
||||
|
||||
|
||||
@dataclass
class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
    """
    Data collator used for masked language modeling with whole word masking.

    - collates batches of tensors, honoring their tokenizer's pad_token
    - preprocesses batches for masked language modeling, selecting all the pieces of a word together

    A token whose string form starts with ``##`` is treated as the continuation of the previous token; the
    ``[CLS]`` and ``[SEP]`` tokens are never selected for masking.
    """

    def __call__(
        self, examples: List[Union[List[int], torch.Tensor, Dict[str, torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """
        Collate `examples` into a batch and apply whole word masking.

        Args:
            examples: Either dict-like items holding an ``"input_ids"`` entry (and optionally a
                ``"chinese_ref"`` entry listing the positions to mark as word continuations), or bare
                sequences of token ids (lists or tensors).

        Returns:
            A dict with the masked ``"input_ids"`` and the matching ``"labels"``.
        """
        if isinstance(examples[0], (dict, BatchEncoding)):
            input_ids = [e["input_ids"] for e in examples]
        else:
            input_ids = examples
            examples = [{"input_ids": e} for e in examples]

        batch_input = self._tensorize_batch(input_ids)

        mask_labels = []
        for e in examples:
            # Accept tensors as well as plain lists of ids (both are allowed by the signature).
            ref_tokens = [
                self.tokenizer._convert_id_to_token(token_id) for token_id in self._to_list(e["input_ids"])
            ]

            # For Chinese tokens, we need extra info to mark sub-words, e.g. [喜,欢] -> [喜,##欢]
            if "chinese_ref" in e:
                # A set keeps the per-position membership test O(1).
                ref_pos = set(self._to_list(e["chinese_ref"]))
                ref_tokens = ["##" + token if i in ref_pos else token for i, token in enumerate(ref_tokens)]
            mask_labels.append(self._whole_word_mask(ref_tokens))

        batch_mask = self._tensorize_batch(mask_labels)
        inputs, labels = self.mask_tokens(batch_input, batch_mask)
        return {"input_ids": inputs, "labels": labels}

    @staticmethod
    def _to_list(values: Union[List[int], torch.Tensor]) -> List[int]:
        """Return `values` as a plain Python list, whether it is a tensor or already a sequence."""
        return values.tolist() if hasattr(values, "tolist") else list(values)

    def _whole_word_mask(self, input_tokens: List[str], max_predictions=512):
        """
        Get 0/1 labels for masked tokens with whole word mask proxy

        Args:
            input_tokens: The token strings of one sequence.
            max_predictions: Upper bound on the number of positions selected for masking.

        Returns:
            A list with one entry per input token: 1 if the token is selected for masking, 0 otherwise.
        """
        # Group token positions into words: a "##" token joins the previous group.
        cand_indexes = []
        for i, token in enumerate(input_tokens):
            if token == "[CLS]" or token == "[SEP]":
                continue

            if len(cand_indexes) >= 1 and token.startswith("##"):
                cand_indexes[-1].append(i)
            else:
                cand_indexes.append([i])

        random.shuffle(cand_indexes)
        num_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))
        masked_lms = []
        covered_indexes = set()
        for index_set in cand_indexes:
            if len(masked_lms) >= num_to_predict:
                break
            # If adding a whole-word mask would exceed the maximum number of
            # predictions, then just skip this candidate.
            if len(masked_lms) + len(index_set) > num_to_predict:
                continue
            # Never select a position twice.
            if any(index in covered_indexes for index in index_set):
                continue
            for index in index_set:
                covered_indexes.add(index)
                masked_lms.append(index)

        assert len(covered_indexes) == len(masked_lms)
        mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
        return mask_labels

    def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        Set 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to its ref.

        Args:
            inputs: Batch of token ids. It is modified in place and returned.
            mask_labels: Tensor of the same shape as `inputs`; non-zero entries mark the positions
                selected for masking. It is left untouched.

        Returns:
            The tuple ``(inputs, labels)`` where `labels` is -100 everywhere except at masked positions.

        Raises:
            ValueError: If the tokenizer has no mask token.
        """

        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
            )
        labels = inputs.clone()

        # The positions to mask were already chosen by the whole word mask; work on a copy so the
        # in-place fills below do not overwrite the caller's tensor.
        probability_matrix = mask_labels.clone()

        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0)
        if self.tokenizer._pad_token is not None:
            padding_mask = labels.eq(self.tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0)

        masked_indices = probability_matrix.bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        # (half of the 20% of masked positions that were not replaced by [MASK])
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataCollatorForSOP(DataCollatorForLanguageModeling):
|
||||
"""
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
from .glue import GlueDataset, GlueDataTrainingArguments
|
||||
from .language_modeling import (
|
||||
LineByLineTextDataset,
|
||||
LineByLineWithRefDataset,
|
||||
LineByLineWithSOPTextDataset,
|
||||
TextDataset,
|
||||
TextDatasetForNextSentencePrediction,
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
@@ -106,12 +107,48 @@ class LineByLineTextDataset(Dataset):
|
||||
|
||||
batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
|
||||
self.examples = batch_encoding["input_ids"]
|
||||
self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]
|
||||
|
||||
def __len__(self):
|
||||
return len(self.examples)
|
||||
|
||||
def __getitem__(self, i) -> torch.Tensor:
|
||||
return torch.tensor(self.examples[i], dtype=torch.long)
|
||||
def __getitem__(self, i) -> Dict[str, torch.tensor]:
|
||||
return self.examples[i]
|
||||
|
||||
|
||||
class LineByLineWithRefDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.

    Line-by-line text dataset in which every example carries, next to its ``"input_ids"``, a
    ``"chinese_ref"`` tensor read from a companion reference file (one JSON value per non-blank line).
    NOTE(review): the reference values are presumably the token positions to treat as sub-word
    continuations for whole word masking; confirm against the consumer of ``"chinese_ref"``.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
        """
        Args:
            tokenizer: Tokenizer called on the list of non-blank lines of `file_path`.
            file_path: UTF-8 text file with one example per line.
            block_size: Maximum length passed to the tokenizer (longer lines are truncated).
            ref_path: UTF-8 file with one JSON value per line, aligned with the lines of `file_path`.
        """
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"
        assert os.path.isfile(ref_path), f"Ref file path {ref_path} not found"
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)
        logger.info("Use ref segment results at %s", ref_path)
        with open(file_path, encoding="utf-8") as f:
            data = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        # Get ref info from file. It is read and checked before tokenizing so that a mismatch
        # between the two files is reported without paying for the tokenization first.
        with open(ref_path, encoding="utf-8") as f:
            ref = [json.loads(line) for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
        assert len(data) == len(ref), f"Found {len(data)} text lines but {len(ref)} ref lines"

        batch_encoding = tokenizer(data, add_special_tokens=True, truncation=True, max_length=block_size)
        self.examples = [
            {
                "input_ids": torch.tensor(ids, dtype=torch.long),
                "chinese_ref": torch.tensor(line_ref, dtype=torch.long),
            }
            for ids, line_ref in zip(batch_encoding["input_ids"], ref)
        ]

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the example at index `i` as a dict with the keys ``"input_ids"`` and ``"chinese_ref"``."""
        return self.examples[i]
|
||||
|
||||
|
||||
class LineByLineWithSOPTextDataset(Dataset):
|
||||
|
||||
Reference in New Issue
Block a user