Fit chinese wwm to new datasets (#9887)

* MOD: fit chinese wwm to new datasets

* MOD: move wwm to new folder

* MOD: format code

* Styling

* MOD: add param and recover trainer

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
This commit is contained in:
wlhgtc
2021-02-01 16:37:59 +08:00
committed by GitHub
parent 24881008a6
commit 1682804ebd
6 changed files with 249 additions and 67 deletions

View File

@@ -402,7 +402,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
# For Chinese tokens, we need extra info to mark sub-words, e.g. [喜,欢] -> [喜,##欢]
if "chinese_ref" in e:
ref_pos = tolist(e["chinese_ref"])
len_seq = e["input_ids"].size(0)
len_seq = len(e["input_ids"])
for i in range(len_seq):
if i in ref_pos:
ref_tokens[i] = "##" + ref_tokens[i]