Merge branch 'master' of https://github.com/huggingface/pytorch-pretrained-BERT
This commit is contained in:
@@ -49,7 +49,7 @@ class DocumentDatabase:
|
|||||||
self._precalculate_doc_weights()
|
self._precalculate_doc_weights()
|
||||||
rand_start = self.doc_cumsum[current_idx]
|
rand_start = self.doc_cumsum[current_idx]
|
||||||
rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
|
rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
|
||||||
sentence_index = randint(rand_start, rand_end) % self.cumsum_max
|
sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max
|
||||||
sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
|
sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
|
||||||
else:
|
else:
|
||||||
# If we don't use sentence weighting, then every doc has an equal chance to be chosen
|
# If we don't use sentence weighting, then every doc has an equal chance to be chosen
|
||||||
|
|||||||
@@ -617,8 +617,14 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
|
hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
|
# Shift so that tokens < n predict n
|
||||||
|
shift_logits = lm_logits[:, :-1].contiguous()
|
||||||
|
shift_labels = lm_labels[:, 1:].contiguous()
|
||||||
|
|
||||||
|
# Flatten the tokens
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||||
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
|
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
|
shift_labels.view(-1))
|
||||||
return loss
|
return loss
|
||||||
return lm_logits, presents
|
return lm_logits, presents
|
||||||
|
|
||||||
@@ -690,8 +696,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
||||||
losses = []
|
losses = []
|
||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
|
shift_logits = lm_logits[:, :-1].contiguous()
|
||||||
|
shift_labels = lm_labels[:, 1:].contiguous()
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||||
losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
|
losses.append(loss_fct(shift_logits.view(-1,
|
||||||
|
shift_logits.size(-1)), shift_labels.view(-1)))
|
||||||
if mc_labels is not None:
|
if mc_labels is not None:
|
||||||
loss_fct = CrossEntropyLoss()
|
loss_fct = CrossEntropyLoss()
|
||||||
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
||||||
|
|||||||
@@ -714,8 +714,14 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
|
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
|
# Shift so that tokens < n predict n
|
||||||
|
shift_logits = lm_logits[:, :-1].contiguous()
|
||||||
|
shift_labels = lm_labels[:, 1:].contiguous()
|
||||||
|
|
||||||
|
# Flatten the tokens
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||||
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
|
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
|
shift_labels.view(-1))
|
||||||
return loss
|
return loss
|
||||||
return lm_logits
|
return lm_logits
|
||||||
|
|
||||||
@@ -801,8 +807,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
|
||||||
losses = []
|
losses = []
|
||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
|
shift_logits = lm_logits[:, :-1].contiguous()
|
||||||
|
shift_labels = lm_labels[:, 1:].contiguous()
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||||
losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
|
losses.append(loss_fct(shift_logits.view(-1,
|
||||||
|
shift_logits.size(-1)), shift_labels.view(-1)))
|
||||||
if mc_labels is not None:
|
if mc_labels is not None:
|
||||||
loss_fct = CrossEntropyLoss()
|
loss_fct = CrossEntropyLoss()
|
||||||
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
|
||||||
|
|||||||
@@ -26,7 +26,8 @@ logger = logging.getLogger(__name__)
|
|||||||
def warmup_cosine(x, warmup=0.002):
|
def warmup_cosine(x, warmup=0.002):
|
||||||
if x < warmup:
|
if x < warmup:
|
||||||
return x/warmup
|
return x/warmup
|
||||||
return 0.5 * (1.0 + torch.cos(math.pi * x))
|
x_ = (x - warmup) / (1 - warmup) # progress after warmup
|
||||||
|
return 0.5 * (1. + math.cos(math.pi * x_))
|
||||||
|
|
||||||
def warmup_constant(x, warmup=0.002):
|
def warmup_constant(x, warmup=0.002):
|
||||||
""" Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
|
""" Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
|
||||||
|
|||||||
@@ -26,7 +26,8 @@ logger = logging.getLogger(__name__)
|
|||||||
def warmup_cosine(x, warmup=0.002):
|
def warmup_cosine(x, warmup=0.002):
|
||||||
if x < warmup:
|
if x < warmup:
|
||||||
return x/warmup
|
return x/warmup
|
||||||
return 0.5 * (1.0 + torch.cos(math.pi * x))
|
x_ = (x - warmup) / (1 - warmup) # progress after warmup
|
||||||
|
return 0.5 * (1. + math.cos(math.pi * x_))
|
||||||
|
|
||||||
def warmup_constant(x, warmup=0.002):
|
def warmup_constant(x, warmup=0.002):
|
||||||
""" Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps.
|
""" Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps.
|
||||||
|
|||||||
Reference in New Issue
Block a user