diff --git a/examples/distillation/lm_seqs_dataset.py b/examples/distillation/lm_seqs_dataset.py
index 691e010cf2..a29e9efb28 100644
--- a/examples/distillation/lm_seqs_dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset):
         self.check()
         self.remove_long_sequences()
         self.remove_empty_sequences()
+        self.remove_unknown_sequences()
         self.check()
         self.print_statistics()
 
@@ -109,6 +110,26 @@ class LmSeqsDataset(Dataset):
         new_size = len(self)
         logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
 
+    def remove_unknown_sequences(self, max_unk_ratio=0.5):
+        """
+        Remove sequences with a (too) high level of unknown tokens.
+
+        A sequence is kept only if its count of unknown tokens divided by its entry
+        in `self.lengths` is strictly below `max_unk_ratio`. Nothing is done when
+        `self.params.special_tok_ids` has no 'unk_token' entry.
+        """
+        if 'unk_token' not in self.params.special_tok_ids:
+            return
+        unk_token_id = self.params.special_tok_ids['unk_token']
+        init_size = len(self)
+        # Number of occurrences of the unknown token in each sequence.
+        unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
+        indices = (unk_occs / self.lengths) < max_unk_ratio
+        self.token_ids = self.token_ids[indices]
+        self.lengths = self.lengths[indices]
+        new_size = len(self)
+        logger.info(f'Remove {init_size - new_size} sequences with a high level of unknown tokens ({max_unk_ratio:.0%}).')
+
     def print_statistics(self):
         """
         Print some statistics on the corpus. Only the master process.
diff --git a/examples/distillation/training_configs/distilbert-base-multilingual-cased.json b/examples/distillation/training_configs/distilbert-base-multilingual-cased.json new file mode 100644 index 0000000000..f76e7febcb --- /dev/null +++ b/examples/distillation/training_configs/distilbert-base-multilingual-cased.json @@ -0,0 +1,15 @@ +{ + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "n_heads": 12, + "n_layers": 6, + "sinusoidal_pos_embds": true, + "tie_weights_": true, + "vocab_size": 119547 + } + \ No newline at end of file diff --git a/examples/distillation/training_configs/distilroberta-base.json b/examples/distillation/training_configs/distilroberta-base.json new file mode 100644 index 0000000000..2d90ef6380 --- /dev/null +++ b/examples/distillation/training_configs/distilroberta-base.json @@ -0,0 +1,14 @@ +{ + "vocab_size": 50265, + "hidden_size": 768, + "num_hidden_layers": 6, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 514, + "type_vocab_size": 1, + "initializer_range": 0.02, + "layer_norm_eps": 0.00001 +} \ No newline at end of file