diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index 1e33190aca..7658fe4acd 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -112,7 +112,7 @@ class Distiller: self.last_log = 0 self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean') - self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) + self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100) if self.alpha_mse > 0.: self.mse_loss_fct = nn.MSELoss(reduction='sum') if self.alpha_cos > 0.: @@ -224,7 +224,7 @@ class Distiller: _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long() token_ids = token_ids.masked_scatter(pred_mask, _token_ids) - mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility + mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size @@ -254,7 +254,7 @@ class Distiller: attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]) clm_labels = token_ids.new(token_ids.size()).copy_(token_ids) - clm_labels[~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility + clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size diff --git a/examples/utils_ner.py b/examples/utils_ner.py index c20d7b0d1f..45ddeafbd5 100644 --- a/examples/utils_ner.py +++ b/examples/utils_ner.py @@ -94,7 +94,7 @@ def convert_examples_to_features(examples, pad_on_left=False, pad_token=0, pad_token_segment_id=0, - pad_token_label_id=-1, + pad_token_label_id=-100, sequence_a_segment_id=0, mask_padding_with_zero=True): """ Loads a data file into a list of `InputBatch`s