Merge remote-tracking branch 'upstream/master'

This commit is contained in:
Dhanajit Brahma
2019-04-07 16:59:36 +05:30
11 changed files with 56 additions and 20 deletions

View File

@@ -57,7 +57,7 @@ class InputFeatures(object):
def convert_examples_to_features(examples, seq_length, tokenizer):
-"""Loads a data file into a list of `InputBatch`s."""
+"""Loads a data file into a list of `InputFeature`s."""
features = []
for (ex_index, example) in enumerate(examples):

View File

@@ -49,7 +49,7 @@ class DocumentDatabase:
self._precalculate_doc_weights()
rand_start = self.doc_cumsum[current_idx]
rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
-sentence_index = randint(rand_start, rand_end) % self.cumsum_max
+sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max
sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
else:
# If we don't use sentence weighting, then every doc has an equal chance to be chosen

View File

@@ -442,7 +442,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
-# since the [SEP] token unambigiously separates the sequences, but it makes
+# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is

View File

@@ -85,9 +85,9 @@ class SquadExample(object):
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
-if self.start_position:
+if self.end_position:
s += ", end_position: %d" % (self.end_position)
-if self.start_position:
+if self.is_impossible:
s += ", is_impossible: %r" % (self.is_impossible)
return s