minor spring cleaning - missing configs + processing
This commit is contained in:
@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset):
|
||||
self.check()
|
||||
self.remove_long_sequences()
|
||||
self.remove_empty_sequences()
|
||||
self.remove_unknown_sequences()
|
||||
self.check()
|
||||
self.print_statistics()
|
||||
|
||||
@@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset):
|
||||
new_size = len(self)
|
||||
logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
|
||||
|
||||
def remove_unknown_sequences(self):
|
||||
"""
|
||||
Remove sequences with a (too) high level of unknown tokens.
|
||||
"""
|
||||
if 'unk_token' not in self.params.special_tok_ids:
|
||||
return
|
||||
else:
|
||||
unk_token_id = self.params.special_tok_ids['unk_token']
|
||||
init_size = len(self)
|
||||
unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
|
||||
indices = (unk_occs/self.lengths) < 0.5
|
||||
self.token_ids = self.token_ids[indices]
|
||||
self.lengths = self.lengths[indices]
|
||||
new_size = len(self)
|
||||
logger.info(f'Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).')
|
||||
|
||||
def print_statistics(self):
|
||||
"""
|
||||
Print some statistics on the corpus. Only the master process.
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"activation": "gelu",
|
||||
"attention_dropout": 0.1,
|
||||
"dim": 768,
|
||||
"dropout": 0.1,
|
||||
"hidden_dim": 3072,
|
||||
"initializer_range": 0.02,
|
||||
"max_position_embeddings": 512,
|
||||
"n_heads": 12,
|
||||
"n_layers": 6,
|
||||
"sinusoidal_pos_embds": true,
|
||||
"tie_weights_": true,
|
||||
"vocab_size": 119547
|
||||
}
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"vocab_size": 50265,
|
||||
"hidden_size": 768,
|
||||
"num_hidden_layers": 6,
|
||||
"num_attention_heads": 12,
|
||||
"intermediate_size": 3072,
|
||||
"hidden_act": "gelu",
|
||||
"hidden_dropout_prob": 0.1,
|
||||
"attention_probs_dropout_prob": 0.1,
|
||||
"max_position_embeddings": 514,
|
||||
"type_vocab_size": 1,
|
||||
"initializer_range": 0.02,
|
||||
"layer_norm_eps": 0.00001
|
||||
}
|
||||
Reference in New Issue
Block a user