Merge pull request #2130 from huggingface/ignored-index-coherence
[BREAKING CHANGE] Setting all ignored indices to the PyTorch standard (-100)
This commit is contained in:
@@ -75,7 +75,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
|
||||
n_batch = len(dataset)
|
||||
input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
|
||||
mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
|
||||
lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
|
||||
lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
|
||||
mc_labels = np.zeros((n_batch,), dtype=np.int64)
|
||||
for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
|
||||
with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
|
||||
|
||||
@@ -112,7 +112,7 @@ class Distiller:
|
||||
self.last_log = 0
|
||||
|
||||
self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
|
||||
self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
|
||||
self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
|
||||
if self.alpha_mse > 0.:
|
||||
self.mse_loss_fct = nn.MSELoss(reduction='sum')
|
||||
if self.alpha_cos > 0.:
|
||||
@@ -186,7 +186,7 @@ class Distiller:
|
||||
-------
|
||||
token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
|
||||
attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
|
||||
mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -1 where there is nothing to predict.
|
||||
mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels. There is a -100 where there is nothing to predict.
|
||||
"""
|
||||
token_ids, lengths = batch
|
||||
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
|
||||
@@ -224,7 +224,7 @@ class Distiller:
|
||||
_token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
|
||||
token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
|
||||
|
||||
mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||
mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||
|
||||
# sanity checks
|
||||
assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
|
||||
@@ -246,7 +246,7 @@ class Distiller:
|
||||
-------
|
||||
token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
|
||||
attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
|
||||
clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict.
|
||||
clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -100 where there is nothing to predict.
|
||||
"""
|
||||
token_ids, lengths = batch
|
||||
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
|
||||
@@ -254,7 +254,7 @@ class Distiller:
|
||||
|
||||
attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
|
||||
clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
|
||||
clm_labels[~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||
clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
|
||||
|
||||
# sanity checks
|
||||
assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
|
||||
|
||||
@@ -150,7 +150,7 @@ def mask_tokens(inputs, tokenizer, args):
|
||||
special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
|
||||
probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
|
||||
masked_indices = torch.bernoulli(probability_matrix).bool()
|
||||
labels[~masked_indices] = -1 # We only compute loss on masked tokens
|
||||
labels[~masked_indices] = -100 # We only compute loss on masked tokens
|
||||
|
||||
# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
|
||||
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
|
||||
|
||||
@@ -94,7 +94,7 @@ def convert_examples_to_features(examples,
|
||||
pad_on_left=False,
|
||||
pad_token=0,
|
||||
pad_token_segment_id=0,
|
||||
pad_token_label_id=-1,
|
||||
pad_token_label_id=-100,
|
||||
sequence_a_segment_id=0,
|
||||
mask_padding_with_zero=True):
|
||||
""" Loads a data file into a list of `InputBatch`s
|
||||
|
||||
Reference in New Issue
Block a user