Fix eval ref miss in Chinese WWM. (#8115)
* ADD: add whole word mask proxy for both eng and chinese
* MOD: adjust format
* MOD: reformat code
* MOD: update import
* MOD: fix bug
* MOD: add import
* MOD: fix bug
* MOD: decouple code and update readme
* MOD: reformat code
* Update examples/language-modeling/README.md
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update examples/language-modeling/README.md
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update examples/language-modeling/run_language_modeling.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update examples/language-modeling/run_language_modeling.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update examples/language-modeling/run_language_modeling.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* Update examples/language-modeling/run_language_modeling.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* change wwm to whole_word_mask
* reformat code
* reformat
* format
* Code quality
* ADD: update chinese ref readme
* MOD: small changes
* MOD: small changes2
* update readme
* fix eval ref file miss bug
* format file
* MOD: move ref code to contrib
* MOD: add delimiter check
* reformat code
* reformat code
* Update examples/language-modeling/README.md
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
@@ -128,15 +128,17 @@ class LineByLineWithRefDataset(Dataset):
|
||||
logger.info("Creating features from dataset file at %s", file_path)
|
||||
logger.info("Use ref segment results at %s", ref_path)
|
||||
with open(file_path, encoding="utf-8") as f:
|
||||
data = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
|
||||
batch_encoding = tokenizer(data, add_special_tokens=True, truncation=True, max_length=block_size)
|
||||
self.examples = batch_encoding["input_ids"]
|
||||
self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]
|
||||
|
||||
data = f.readlines() # use this method to avoid delimiter '\u2029' to split a line
|
||||
data = [line.strip() for line in data if len(line) > 0 and not line.isspace()]
|
||||
# Get ref info from file
|
||||
with open(ref_path, encoding="utf-8") as f:
|
||||
ref = [json.loads(line) for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
|
||||
assert len(data) == len(ref)
|
||||
|
||||
batch_encoding = tokenizer(data, add_special_tokens=True, truncation=True, max_length=block_size)
|
||||
self.examples = batch_encoding["input_ids"]
|
||||
self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]
|
||||
|
||||
n = len(self.examples)
|
||||
for i in range(n):
|
||||
self.examples[i]["chinese_ref"] = torch.tensor(ref[i], dtype=torch.long)
|
||||
|
||||
Reference in New Issue
Block a user