From 8163baab646c8ad76877ef073a707e88ea096bab Mon Sep 17 00:00:00 2001 From: Tim Rault Date: Thu, 1 Nov 2018 09:39:04 +0100 Subject: [PATCH] Convert indentation from 2 spaces to 4 spaces --- create_pretraining_data.py | 578 ++++++------- extract_features.py | 518 ++++++------ modeling.py | 1496 ++++++++++++++++----------------- modeling_test.py | 421 +++++----- optimization.py | 236 +++--- optimization_test.py | 42 +- run_classifier.py | 904 ++++++++++---------- run_pretraining.py | 618 +++++++------- run_squad.py | 1606 ++++++++++++++++++------------------ tokenization.py | 416 +++++----- tokenization_test.py | 146 ++-- 11 files changed, 3490 insertions(+), 3491 deletions(-) diff --git a/create_pretraining_data.py b/create_pretraining_data.py index e848025d96..b89e5ad2bb 100644 --- a/create_pretraining_data.py +++ b/create_pretraining_data.py @@ -63,379 +63,379 @@ flags.DEFINE_float( class TrainingInstance(object): - """A single training instance (sentence pair).""" + """A single training instance (sentence pair).""" - def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, - is_random_next): - self.tokens = tokens - self.segment_ids = segment_ids - self.is_random_next = is_random_next - self.masked_lm_positions = masked_lm_positions - self.masked_lm_labels = masked_lm_labels + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels - def __str__(self): - s = "" - s += "tokens: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.tokens])) - s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) - s += "is_random_next: %s\n" % self.is_random_next - s += "masked_lm_positions: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_positions])) - s += "masked_lm_labels: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.masked_lm_labels])) - s += "\n" - return s + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s - def __repr__(self): - return self.__str__() + def __repr__(self): + return self.__str__() def write_instance_to_example_files(instances, tokenizer, max_seq_length, max_predictions_per_seq, output_files): - """Create TF example files from `TrainingInstance`s.""" - writers = [] - for output_file in output_files: - writers.append(tf.python_io.TFRecordWriter(output_file)) + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) - writer_index = 0 + writer_index = 0 - total_written = 0 - for (inst_index, instance) in enumerate(instances): - input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) - input_mask = [1] * len(input_ids) - segment_ids = list(instance.segment_ids) - assert len(input_ids) <= max_seq_length + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length - masked_lm_positions = list(instance.masked_lm_positions) - masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) - masked_lm_weights = [1.0] * len(masked_lm_ids) + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) - while len(masked_lm_positions) < max_predictions_per_seq: - masked_lm_positions.append(0) - masked_lm_ids.append(0) - masked_lm_weights.append(0.0) + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) - next_sentence_label = 1 if instance.is_random_next else 0 + next_sentence_label = 1 if instance.is_random_next else 0 - features = collections.OrderedDict() - features["input_ids"] = create_int_feature(input_ids) - features["input_mask"] = create_int_feature(input_mask) - features["segment_ids"] = create_int_feature(segment_ids) - features["masked_lm_positions"] = create_int_feature(masked_lm_positions) - features["masked_lm_ids"] = create_int_feature(masked_lm_ids) - features["masked_lm_weights"] = create_float_feature(masked_lm_weights) - features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - writers[writer_index].write(tf_example.SerializeToString()) - writer_index = (writer_index + 1) % len(writers) + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) - total_written += 1 + total_written += 1 - if inst_index < 20: - tf.logging.info("*** Example ***") - tf.logging.info("tokens: %s" % " ".join( - [tokenization.printable_text(x) for x in instance.tokens])) + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) - for feature_name in features.keys(): - feature = features[feature_name] - values = [] - if feature.int64_list.value: - values = feature.int64_list.value - elif feature.float_list.value: - values = feature.float_list.value - tf.logging.info( - "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) - for writer in writers: - writer.close() + for writer in writers: + writer.close() - tf.logging.info("Wrote %d total instances", total_written) + tf.logging.info("Wrote %d total instances", total_written) def create_int_feature(values): - feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) - return feature + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature def create_float_feature(values): - feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) - return feature + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature def create_training_instances(input_files, tokenizer, max_seq_length, dupe_factor, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng): - """Create `TrainingInstance`s from raw text.""" - all_documents = [[]] + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] - # Input file format: - # (1) One sentence per line. These should ideally be actual sentences, not - # entire paragraphs or arbitrary spans of text. (Because we use the - # sentence boundaries for the "next sentence prediction" task). - # (2) Blank lines between documents. Document boundaries are needed so - # that the "next sentence prediction" task doesn't span between documents. - for input_file in input_files: - with tf.gfile.GFile(input_file, "r") as reader: - while True: - line = tokenization.convert_to_unicode(reader.readline()) - if not line: - break - line = line.strip() + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + for input_file in input_files: + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() - # Empty lines are used as document delimiters - if not line: - all_documents.append([]) - tokens = tokenizer.tokenize(line) - if tokens: - all_documents[-1].append(tokens) + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) - # Remove empty documents - all_documents = [x for x in all_documents if x] - rng.shuffle(all_documents) + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) - vocab_words = list(tokenizer.vocab.keys()) - instances = [] - for _ in range(dupe_factor): - for document_index in range(len(all_documents)): - instances.extend( - create_instances_from_document( - all_documents, document_index, max_seq_length, short_seq_prob, - masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) - rng.shuffle(instances) - return instances + rng.shuffle(instances) + return instances def create_instances_from_document( - all_documents, document_index, max_seq_length, short_seq_prob, - masked_lm_prob, max_predictions_per_seq, vocab_words, rng): - """Creates `TrainingInstance`s for a single document.""" - document = all_documents[document_index] + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] - # Account for [CLS], [SEP], [SEP] - max_num_tokens = max_seq_length - 3 + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 - # We *usually* want to fill up the entire sequence since we are padding - # to `max_seq_length` anyways, so short sequences are generally wasted - # computation. However, we *sometimes* - # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter - # sequences to minimize the mismatch between pre-training and fine-tuning. - # The `target_seq_length` is just a rough target however, whereas - # `max_seq_length` is a hard limit. - target_seq_length = max_num_tokens - if rng.random() < short_seq_prob: - target_seq_length = rng.randint(2, max_num_tokens) + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) - # We DON'T just concatenate all of the tokens from a document into a long - # sequence and choose an arbitrary split point because this would make the - # next sentence prediction task too easy. Instead, we split the input into - # segments "A" and "B" based on the actual "sentences" provided by the user - # input. - instances = [] - current_chunk = [] - current_length = 0 - i = 0 - while i < len(document): - segment = document[i] - current_chunk.append(segment) - current_length += len(segment) - if i == len(document) - 1 or current_length >= target_seq_length: - if current_chunk: - # `a_end` is how many segments from `current_chunk` go into the `A` - # (first) sentence. - a_end = 1 - if len(current_chunk) >= 2: - a_end = rng.randint(1, len(current_chunk) - 1) + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) - tokens_a = [] - for j in range(a_end): - tokens_a.extend(current_chunk[j]) + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) - tokens_b = [] - # Random next - is_random_next = False - if len(current_chunk) == 1 or rng.random() < 0.5: - is_random_next = True - target_b_length = target_seq_length - len(tokens_a) + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) - # This should rarely go for more than one iteration for large - # corpora. However, just to be careful, we try to make sure that - # the random document is not the same as the document - # we're processing. - for _ in range(10): - random_document_index = rng.randint(0, len(all_documents) - 1) - if random_document_index != document_index: - break + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break - random_document = all_documents[random_document_index] - random_start = rng.randint(0, len(random_document) - 1) - for j in range(random_start, len(random_document)): - tokens_b.extend(random_document[j]) - if len(tokens_b) >= target_b_length: - break - # We didn't actually use these segments so we "put them back" so - # they don't go to waste. - num_unused_segments = len(current_chunk) - a_end - i -= num_unused_segments - # Actual next - else: - is_random_next = False - for j in range(a_end, len(current_chunk)): - tokens_b.extend(current_chunk[j]) - truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) - assert len(tokens_a) >= 1 - assert len(tokens_b) >= 1 + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) - (tokens, masked_lm_positions, - masked_lm_labels) = create_masked_lm_predictions( - tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) - instance = TrainingInstance( - tokens=tokens, - segment_ids=segment_ids, - is_random_next=is_random_next, - masked_lm_positions=masked_lm_positions, - masked_lm_labels=masked_lm_labels) - instances.append(instance) - current_chunk = [] - current_length = 0 - i += 1 + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 - return instances + return instances def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng): - """Creates the predictis for the masked LM objective.""" + """Creates the predictis for the masked LM objective.""" - cand_indexes = [] - for (i, token) in enumerate(tokens): - if token == "[CLS]" or token == "[SEP]": - continue - cand_indexes.append(i) + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + cand_indexes.append(i) - rng.shuffle(cand_indexes) + rng.shuffle(cand_indexes) - output_tokens = list(tokens) + output_tokens = list(tokens) - masked_lm = collections.namedtuple("masked_lm", ["index", "label"]) # pylint: disable=invalid-name + masked_lm = collections.namedtuple("masked_lm", ["index", "label"]) # pylint: disable=invalid-name - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) - masked_lms = [] - covered_indexes = set() - for index in cand_indexes: - if len(masked_lms) >= num_to_predict: - break - if index in covered_indexes: - continue - covered_indexes.add(index) + masked_lms = [] + covered_indexes = set() + for index in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + if index in covered_indexes: + continue + covered_indexes.add(index) - masked_token = None - # 80% of the time, replace with [MASK] - if rng.random() < 0.8: - masked_token = "[MASK]" - else: - # 10% of the time, keep original - if rng.random() < 0.5: - masked_token = tokens[index] - # 10% of the time, replace with random word - else: - masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] - output_tokens[index] = masked_token + output_tokens[index] = masked_token - masked_lms.append(masked_lm(index=index, label=tokens[index])) + masked_lms.append(masked_lm(index=index, label=tokens[index])) - masked_lms = sorted(masked_lms, key=lambda x: x.index) + masked_lms = sorted(masked_lms, key=lambda x: x.index) - masked_lm_positions = [] - masked_lm_labels = [] - for p in masked_lms: - masked_lm_positions.append(p.index) - masked_lm_labels.append(p.label) + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) - return (output_tokens, masked_lm_positions, masked_lm_labels) + return (output_tokens, masked_lm_positions, masked_lm_labels) def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): - """Truncates a pair of sequences to a maximum sequence length.""" - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_num_tokens: - break + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break - trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b - assert len(trunc_tokens) >= 1 + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 - # We want to sometimes truncate from the front and sometimes from the - # back to add more randomness and avoid biases. - if rng.random() < 0.5: - del trunc_tokens[0] - else: - trunc_tokens.pop() + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() def main(_): - tf.logging.set_verbosity(tf.logging.INFO) + tf.logging.set_verbosity(tf.logging.INFO) - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - input_files = [] - for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) - tf.logging.info("*** Reading from input files ***") - for input_file in input_files: - tf.logging.info(" %s", input_file) + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) - rng = random.Random(FLAGS.random_seed) - instances = create_training_instances( - input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, - FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, - rng) + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) - output_files = FLAGS.output_file.split(",") - tf.logging.info("*** Writing to output files ***") - for output_file in output_files: - tf.logging.info(" %s", output_file) + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) - write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, - FLAGS.max_predictions_per_seq, output_files) + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) if __name__ == "__main__": - flags.mark_flag_as_required("input_file") - flags.mark_flag_as_required("output_file") - flags.mark_flag_as_required("vocab_file") - tf.app.run() + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() diff --git a/extract_features.py b/extract_features.py index 66e0a82330..dd6a10b4df 100644 --- a/extract_features.py +++ b/extract_features.py @@ -80,330 +80,330 @@ flags.DEFINE_bool( class InputExample(object): - def __init__(self, unique_id, text_a, text_b): - self.unique_id = unique_id - self.text_a = text_a - self.text_b = text_b + def __init__(self, unique_id, text_a, text_b): + self.unique_id = unique_id + self.text_a = text_a + self.text_b = text_b class InputFeatures(object): - """A single set of features of data.""" + """A single set of features of data.""" - def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): - self.unique_id = unique_id - self.tokens = tokens - self.input_ids = input_ids - self.input_mask = input_mask - self.input_type_ids = input_type_ids + def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): + self.unique_id = unique_id + self.tokens = tokens + self.input_ids = input_ids + self.input_mask = input_mask + self.input_type_ids = input_type_ids def input_fn_builder(features, seq_length): - """Creates an `input_fn` closure to be passed to TPUEstimator.""" + """Creates an `input_fn` closure to be passed to TPUEstimator.""" - all_unique_ids = [] - all_input_ids = [] - all_input_mask = [] - all_input_type_ids = [] + all_unique_ids = [] + all_input_ids = [] + all_input_mask = [] + all_input_type_ids = [] - for feature in features: - all_unique_ids.append(feature.unique_id) - all_input_ids.append(feature.input_ids) - all_input_mask.append(feature.input_mask) - all_input_type_ids.append(feature.input_type_ids) + for feature in features: + all_unique_ids.append(feature.unique_id) + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_input_type_ids.append(feature.input_type_ids) - def input_fn(params): - """The actual input function.""" - batch_size = params["batch_size"] + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] - num_examples = len(features) + num_examples = len(features) - # This is for demo purposes and does NOT scale to large data sets. We do - # not use Dataset.from_generator() because that uses tf.py_func which is - # not TPU compatible. The right way to load data is with TFRecordReader. - d = tf.data.Dataset.from_tensor_slices({ - "unique_ids": - tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), - "input_ids": - tf.constant( - all_input_ids, shape=[num_examples, seq_length], - dtype=tf.int32), - "input_mask": - tf.constant( - all_input_mask, - shape=[num_examples, seq_length], - dtype=tf.int32), - "input_type_ids": - tf.constant( - all_input_type_ids, - shape=[num_examples, seq_length], - dtype=tf.int32), - }) + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "unique_ids": + tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "input_type_ids": + tf.constant( + all_input_type_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + }) - d = d.batch(batch_size=batch_size, drop_remainder=False) - return d + d = d.batch(batch_size=batch_size, drop_remainder=False) + return d - return input_fn + return input_fn def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu, use_one_hot_embeddings): - """Returns `model_fn` closure for TPUEstimator.""" + """Returns `model_fn` closure for TPUEstimator.""" - def model_fn(features, labels, mode, params): # pylint: disable=unused-argument - """The `model_fn` for TPUEstimator.""" + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" - unique_ids = features["unique_ids"] - input_ids = features["input_ids"] - input_mask = features["input_mask"] - input_type_ids = features["input_type_ids"] + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + input_type_ids = features["input_type_ids"] - model = modeling.BertModel( - config=bert_config, - is_training=False, - input_ids=input_ids, - input_mask=input_mask, - token_type_ids=input_type_ids, - use_one_hot_embeddings=use_one_hot_embeddings) + model = modeling.BertModel( + config=bert_config, + is_training=False, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=input_type_ids, + use_one_hot_embeddings=use_one_hot_embeddings) - if mode != tf.estimator.ModeKeys.PREDICT: - raise ValueError("Only PREDICT modes are supported: %s" % (mode)) + if mode != tf.estimator.ModeKeys.PREDICT: + raise ValueError("Only PREDICT modes are supported: %s" % (mode)) - tvars = tf.trainable_variables() - scaffold_fn = None - (assignment_map, _) = modeling.get_assigment_map_from_checkpoint( - tvars, init_checkpoint) - if use_tpu: + tvars = tf.trainable_variables() + scaffold_fn = None + (assignment_map, _) = modeling.get_assigment_map_from_checkpoint( + tvars, init_checkpoint) + if use_tpu: - def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() - scaffold_fn = tpu_scaffold - else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - all_layers = model.get_all_encoder_layers() + all_layers = model.get_all_encoder_layers() - predictions = { - "unique_id": unique_ids, - } + predictions = { + "unique_id": unique_ids, + } - for (i, layer_index) in enumerate(layer_indexes): - predictions["layer_output_%d" % i] = all_layers[layer_index] + for (i, layer_index) in enumerate(layer_indexes): + predictions["layer_output_%d" % i] = all_layers[layer_index] - output_spec = tf.contrib.tpu.TPUEstimatorSpec( - mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) - return output_spec + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + return output_spec - return model_fn + return model_fn def convert_examples_to_features(examples, seq_length, tokenizer): - """Loads a data file into a list of `InputBatch`s.""" + """Loads a data file into a list of `InputBatch`s.""" - features = [] - for (ex_index, example) in enumerate(examples): - tokens_a = tokenizer.tokenize(example.text_a) + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > seq_length - 2: - tokens_a = tokens_a[0:(seq_length - 2)] + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > seq_length - 2: + tokens_a = tokens_a[0:(seq_length - 2)] - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambigiously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - input_type_ids = [] - tokens.append("[CLS]") - input_type_ids.append(0) - for token in tokens_a: - tokens.append(token) - input_type_ids.append(0) - tokens.append("[SEP]") - input_type_ids.append(0) + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambigiously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + input_type_ids = [] + tokens.append("[CLS]") + input_type_ids.append(0) + for token in tokens_a: + tokens.append(token) + input_type_ids.append(0) + tokens.append("[SEP]") + input_type_ids.append(0) - if tokens_b: - for token in tokens_b: - tokens.append(token) - input_type_ids.append(1) - tokens.append("[SEP]") - input_type_ids.append(1) + if tokens_b: + for token in tokens_b: + tokens.append(token) + input_type_ids.append(1) + tokens.append("[SEP]") + input_type_ids.append(1) - input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_ids = tokenizer.convert_tokens_to_ids(tokens) - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) - # Zero-pad up to the sequence length. - while len(input_ids) < seq_length: - input_ids.append(0) - input_mask.append(0) - input_type_ids.append(0) + # Zero-pad up to the sequence length. + while len(input_ids) < seq_length: + input_ids.append(0) + input_mask.append(0) + input_type_ids.append(0) - assert len(input_ids) == seq_length - assert len(input_mask) == seq_length - assert len(input_type_ids) == seq_length + assert len(input_ids) == seq_length + assert len(input_mask) == seq_length + assert len(input_type_ids) == seq_length - if ex_index < 5: - tf.logging.info("*** Example ***") - tf.logging.info("unique_id: %s" % (example.unique_id)) - tf.logging.info("tokens: %s" % " ".join([str(x) for x in tokens])) - tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - tf.logging.info( - "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("unique_id: %s" % (example.unique_id)) + tf.logging.info("tokens: %s" % " ".join([str(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info( + "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) - features.append( - InputFeatures( - unique_id=example.unique_id, - tokens=tokens, - input_ids=input_ids, - input_mask=input_mask, - input_type_ids=input_type_ids)) - return features + features.append( + InputFeatures( + unique_id=example.unique_id, + tokens=tokens, + input_ids=input_ids, + input_mask=input_mask, + input_type_ids=input_type_ids)) + return features def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" + """Truncates a sequence pair in place to the maximum length.""" - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() def read_examples(input_file): - """Read a list of `InputExample`s from an input file.""" - examples = [] - unique_id = 0 - with tf.gfile.GFile(input_file, "r") as reader: - while True: - line = tokenization.convert_to_unicode(reader.readline()) - if not line: - break - line = line.strip() - text_a = None - text_b = None - m = re.match(r"^(.*) \|\|\| (.*)$", line) - if m is None: - text_a = line - else: - text_a = m.group(1) - text_b = m.group(2) - examples.append( - InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) - unique_id += 1 - return examples + """Read a list of `InputExample`s from an input file.""" + examples = [] + unique_id = 0 + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + text_a = None + text_b = None + m = re.match(r"^(.*) \|\|\| (.*)$", line) + if m is None: + text_a = line + else: + text_a = m.group(1) + text_b = m.group(2) + examples.append( + InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) + unique_id += 1 + return examples def main(_): - tf.logging.set_verbosity(tf.logging.INFO) + tf.logging.set_verbosity(tf.logging.INFO) - layer_indexes = [int(x) for x in FLAGS.layers.split(",")] + layer_indexes = [int(x) for x in FLAGS.layers.split(",")] - bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 - run_config = tf.contrib.tpu.RunConfig( - master=FLAGS.master, - tpu_config=tf.contrib.tpu.TPUConfig( - num_shards=FLAGS.num_tpu_cores, - per_host_input_for_training=is_per_host)) + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + master=FLAGS.master, + tpu_config=tf.contrib.tpu.TPUConfig( + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) - examples = read_examples(FLAGS.input_file) + examples = read_examples(FLAGS.input_file) - features = convert_examples_to_features( - examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) + features = convert_examples_to_features( + examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) - unique_id_to_feature = {} - for feature in features: - unique_id_to_feature[feature.unique_id] = feature + unique_id_to_feature = {} + for feature in features: + unique_id_to_feature[feature.unique_id] = feature - model_fn = model_fn_builder( - bert_config=bert_config, - init_checkpoint=FLAGS.init_checkpoint, - layer_indexes=layer_indexes, - use_tpu=FLAGS.use_tpu, - use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + layer_indexes=layer_indexes, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) - # If TPU is not available, this will fall back to normal Estimator on CPU - # or GPU. - estimator = tf.contrib.tpu.TPUEstimator( - use_tpu=FLAGS.use_tpu, - model_fn=model_fn, - config=run_config, - predict_batch_size=FLAGS.batch_size) + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + predict_batch_size=FLAGS.batch_size) - input_fn = input_fn_builder( - features=features, seq_length=FLAGS.max_seq_length) + input_fn = input_fn_builder( + features=features, seq_length=FLAGS.max_seq_length) - with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, - "w")) as writer: - for result in estimator.predict(input_fn, yield_single_examples=True): - unique_id = int(result["unique_id"]) - feature = unique_id_to_feature[unique_id] - output_json = collections.OrderedDict() - output_json["linex_index"] = unique_id - all_features = [] - for (i, token) in enumerate(feature.tokens): - all_layers = [] - for (j, layer_index) in enumerate(layer_indexes): - layer_output = result["layer_output_%d" % j] - layers = collections.OrderedDict() - layers["index"] = layer_index - layers["values"] = [ - round(float(x), 6) for x in layer_output[i:(i + 1)].flat - ] - all_layers.append(layers) - features = collections.OrderedDict() - features["token"] = token - features["layers"] = all_layers - all_features.append(features) - output_json["features"] = all_features - writer.write(json.dumps(output_json) + "\n") + with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, + "w")) as writer: + for result in estimator.predict(input_fn, yield_single_examples=True): + unique_id = int(result["unique_id"]) + feature = unique_id_to_feature[unique_id] + output_json = collections.OrderedDict() + output_json["linex_index"] = unique_id + all_features = [] + for (i, token) in enumerate(feature.tokens): + all_layers = [] + for (j, layer_index) in enumerate(layer_indexes): + layer_output = result["layer_output_%d" % j] + layers = collections.OrderedDict() + layers["index"] = layer_index + layers["values"] = [ + round(float(x), 6) for x in layer_output[i:(i + 1)].flat + ] + all_layers.append(layers) + features = collections.OrderedDict() + features["token"] = token + features["layers"] = all_layers + all_features.append(features) + output_json["features"] = all_features + writer.write(json.dumps(output_json) + "\n") if __name__ == "__main__": - flags.mark_flag_as_required("input_file") - flags.mark_flag_as_required("vocab_file") - flags.mark_flag_as_required("bert_config_file") - flags.mark_flag_as_required("init_checkpoint") - flags.mark_flag_as_required("output_file") - tf.app.run() + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("init_checkpoint") + flags.mark_flag_as_required("output_file") + tf.app.run() diff --git a/modeling.py b/modeling.py index 8f599443b7..5e246fc927 100644 --- a/modeling.py +++ b/modeling.py @@ -28,354 +28,354 @@ import tensorflow as tf class BertConfig(object): - """Configuration for `BertModel`.""" + """Configuration for `BertModel`.""" - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02): - """Constructs BertConfig. + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02): + """Constructs BertConfig. - Args: - vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. - hidden_dropout_prob: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - """ - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range + Args: + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range - @classmethod - def from_dict(cls, json_object): - """Constructs a `BertConfig` from a Python dictionary of parameters.""" - config = BertConfig(vocab_size=None) - for (key, value) in six.iteritems(json_object): - config.__dict__[key] = value - return config + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config - @classmethod - def from_json_file(cls, json_file): - """Constructs a `BertConfig` from a json file of parameters.""" - with tf.gfile.GFile(json_file, "r") as reader: - text = reader.read() - return cls.from_dict(json.loads(text)) + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" class BertModel(object): - """BERT model ("Bidirectional Embedding Representations from a Transformer"). + """BERT model ("Bidirectional Embedding Representations from a Transformer"). - Example usage: + Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) - input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) - token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) - config = modeling.BertConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) - model = modeling.BertModel(config=config, is_training=True, - input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) - label_embeddings = tf.get_variable(...) - pooled_output = model.get_pooled_output() - logits = tf.matmul(pooled_output, label_embeddings) - ... - ``` - """ - - def __init__(self, - config, - is_training, - input_ids, - input_mask=None, - token_type_ids=None, - use_one_hot_embeddings=True, - scope=None): - """Constructor for BertModel. - - Args: - config: `BertConfig` instance. - is_training: bool. rue for training model, false for eval model. Controls - whether dropout will be applied. - input_ids: int32 Tensor of shape [batch_size, seq_length]. - input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. - token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. - use_one_hot_embeddings: (optional) bool. Whether to use one-hot word - embeddings or tf.embedding_lookup() for the word embeddings. On the TPU, - it is must faster if this is True, on the CPU or GPU, it is faster if - this is False. - scope: (optional) variable scope. Defaults to "bert". - - Raises: - ValueError: The config is invalid or one of the input tensor shapes - is invalid. + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... + ``` """ - config = copy.deepcopy(config) - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 - input_shape = get_shape_list(input_ids, expected_rank=2) - batch_size = input_shape[0] - seq_length = input_shape[1] + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=True, + scope=None): + """Constructor for BertModel. - if input_mask is None: - input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + Args: + config: `BertConfig` instance. + is_training: bool. rue for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. On the TPU, + it is must faster if this is True, on the CPU or GPU, it is faster if + this is False. + scope: (optional) variable scope. Defaults to "bert". - if token_type_ids is None: - token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. + """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 - with tf.variable_scope("bert", scope): - with tf.variable_scope("embeddings"): - # Perform embedding lookup on the word ids. - (self.embedding_output, self.embedding_table) = embedding_lookup( - input_ids=input_ids, - vocab_size=config.vocab_size, - embedding_size=config.hidden_size, - initializer_range=config.initializer_range, - word_embedding_name="word_embeddings", - use_one_hot_embeddings=use_one_hot_embeddings) + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] - # Add positional embeddings and token type embeddings, then layer - # normalize and perform dropout. - self.embedding_output = embedding_postprocessor( - input_tensor=self.embedding_output, - use_token_type=True, - token_type_ids=token_type_ids, - token_type_vocab_size=config.type_vocab_size, - token_type_embedding_name="token_type_embeddings", - use_position_embeddings=True, - position_embedding_name="position_embeddings", - initializer_range=config.initializer_range, - max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob) + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) - with tf.variable_scope("encoder"): - # This converts a 2D mask of shape [batch_size, seq_length] to a 3D - # mask of shape [batch_size, seq_length, seq_length] which is used - # for the attention scores. - attention_mask = create_attention_mask_from_input_mask( - input_ids, input_mask) + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) - # Run the stacked transformer. - # `sequence_output` shape = [batch_size, seq_length, hidden_size]. - self.all_encoder_layers = transformer_model( - input_tensor=self.embedding_output, - attention_mask=attention_mask, - hidden_size=config.hidden_size, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - intermediate_size=config.intermediate_size, - intermediate_act_fn=get_activation(config.hidden_act), - hidden_dropout_prob=config.hidden_dropout_prob, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - initializer_range=config.initializer_range, - do_return_all_layers=True) + with tf.variable_scope("bert", scope): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. + (self.embedding_output, self.embedding_table) = embedding_lookup( + input_ids=input_ids, + vocab_size=config.vocab_size, + embedding_size=config.hidden_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) - self.sequence_output = self.all_encoder_layers[-1] - # The "pooler" converts the encoded sequence tensor of shape - # [batch_size, seq_length, hidden_size] to a tensor of shape - # [batch_size, hidden_size]. This is necessary for segment-level - # (or segment-pair-level) classification tasks where we need a fixed - # dimensional representation of the segment. - with tf.variable_scope("pooler"): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. We assume that this has been pre-trained - first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) - self.pooled_output = tf.layers.dense( - first_token_tensor, - config.hidden_size, - activation=tf.tanh, - kernel_initializer=create_initializer(config.initializer_range)) + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) - def get_pooled_output(self): - return self.pooled_output + with tf.variable_scope("encoder"): + # This converts a 2D mask of shape [batch_size, seq_length] to a 3D + # mask of shape [batch_size, seq_length, seq_length] which is used + # for the attention scores. + attention_mask = create_attention_mask_from_input_mask( + input_ids, input_mask) - def get_sequence_output(self): - """Gets final hidden layer of encoder. + # Run the stacked transformer. + # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size] corresponding - to the final hidden of the transformer encoder. - """ - return self.sequence_output + self.sequence_output = self.all_encoder_layers[-1] + # The "pooler" converts the encoded sequence tensor of shape + # [batch_size, seq_length, hidden_size] to a tensor of shape + # [batch_size, hidden_size]. This is necessary for segment-level + # (or segment-pair-level) classification tasks where we need a fixed + # dimensional representation of the segment. + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = tf.layers.dense( + first_token_tensor, + config.hidden_size, + activation=tf.tanh, + kernel_initializer=create_initializer(config.initializer_range)) - def get_all_encoder_layers(self): - return self.all_encoder_layers + def get_pooled_output(self): + return self.pooled_output - def get_embedding_output(self): - """Gets output of the embedding lookup (i.e., input to the transformer). + def get_sequence_output(self): + """Gets final hidden layer of encoder. - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size] corresponding - to the output of the embedding layer, after summing the word - embeddings with the positional embeddings and the token type embeddings, - then performing layer normalization. This is the input to the transformer. - """ - return self.embedding_output + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the final hidden of the transformer encoder. + """ + return self.sequence_output - def get_embedding_table(self): - return self.embedding_table + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + """Gets output of the embedding lookup (i.e., input to the transformer). + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the output of the embedding layer, after summing the word + embeddings with the positional embeddings and the token type embeddings, + then performing layer normalization. This is the input to the transformer. + """ + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table def gelu(input_tensor): - """Gaussian Error Linear Unit. + """Gaussian Error Linear Unit. - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 - Args: - input_tensor: float Tensor to perform activation. + Args: + input_tensor: float Tensor to perform activation. - Returns: - `input_tensor` with the GELU activation applied. - """ - cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) - return input_tensor * cdf + Returns: + `input_tensor` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) + return input_tensor * cdf def get_activation(activation_string): - """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. - Args: - activation_string: String name of the activation function. + Args: + activation_string: String name of the activation function. - Returns: - A Python function corresponding to the activation function. If - `activation_string` is None, empty, or "linear", this will return None. - If `activation_string` is not a string, it will return `activation_string`. + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. - Raises: - ValueError: The `activation_string` does not correspond to a known - activation. - """ + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ - # We assume that anything that"s not a string is already an activation - # function, so we just return it. - if not isinstance(activation_string, six.string_types): - return activation_string + # We assume that anything that"s not a string is already an activation + # function, so we just return it. + if not isinstance(activation_string, six.string_types): + return activation_string - if not activation_string: - return None + if not activation_string: + return None - act = activation_string.lower() - if act == "linear": - return None - elif act == "relu": - return tf.nn.relu - elif act == "gelu": - return gelu - elif act == "tanh": - return tf.tanh - else: - raise ValueError("Unsupported activation: %s" % act) + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) def get_assigment_map_from_checkpoint(tvars, init_checkpoint): - """Compute the union of the current variables and checkpoint variables.""" - assignment_map = {} - initialized_variable_names = {} + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} - name_to_variable = collections.OrderedDict() - for var in tvars: - name = var.name - m = re.match("^(.*):\\d+$", name) - if m is not None: - name = m.group(1) - name_to_variable[name] = var + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var - init_vars = tf.train.list_variables(init_checkpoint) + init_vars = tf.train.list_variables(init_checkpoint) - assignment_map = collections.OrderedDict() - for x in init_vars: - (name, var) = (x[0], x[1]) - if name not in name_to_variable: - continue - assignment_map[name] = name - initialized_variable_names[name] = 1 - initialized_variable_names[name + ":0"] = 1 + assignment_map = collections.OrderedDict() + for x in init_vars: + (name, var) = (x[0], x[1]) + if name not in name_to_variable: + continue + assignment_map[name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[name + ":0"] = 1 - return (assignment_map, initialized_variable_names) + return (assignment_map, initialized_variable_names) def dropout(input_tensor, dropout_prob): - """Perform dropout. + """Perform dropout. - Args: - input_tensor: float Tensor. - dropout_prob: Python float. The probabiltiy of dropping out a value (NOT of - *keeping* a dimension as in `tf.nn.dropout`). + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probabiltiy of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). - Returns: - A version of `input_tensor` with dropout applied. - """ - if dropout_prob is None or dropout_prob == 0.0: - return input_tensor + Returns: + A version of `input_tensor` with dropout applied. + """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor - output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) - return output + output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) + return output def layer_norm(input_tensor, name=None): - """Run layer normalization on the last dimension of the tensor.""" - return tf.contrib.layers.layer_norm( - inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): - """Runs layer normalization followed by dropout.""" - output_tensor = layer_norm(input_tensor, name) - output_tensor = dropout(output_tensor, dropout_prob) - return output_tensor + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor def create_initializer(initializer_range=0.02): - """Creates a `truncated_normal_initializer` with the given range.""" - return tf.truncated_normal_initializer(stddev=initializer_range) + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) def embedding_lookup(input_ids, @@ -384,47 +384,47 @@ def embedding_lookup(input_ids, initializer_range=0.02, word_embedding_name="word_embeddings", use_one_hot_embeddings=False): - """Looks up words embeddings for id tensor. + """Looks up words embeddings for id tensor. - Args: - input_ids: int32 Tensor of shape [batch_size, seq_length] containing word - ids. - vocab_size: int. Size of the embedding vocabulary. - embedding_size: int. Width of the word embeddings. - initializer_range: float. Embedding initialization range. - word_embedding_name: string. Name of the embedding table. - use_one_hot_embeddings: bool. If True, use one-hot method for word - embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better - for TPUs. + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better + for TPUs. - Returns: - float Tensor of shape [batch_size, seq_length, embedding_size]. - """ - # This function assumes that the input is of shape [batch_size, seq_length, - # num_inputs]. - # - # If the input is a 2D tensor of shape [batch_size, seq_length], we - # reshape to [batch_size, seq_length, 1]. - if input_ids.shape.ndims == 2: - input_ids = tf.expand_dims(input_ids, axis=[-1]) + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) - embedding_table = tf.get_variable( - name=word_embedding_name, - shape=[vocab_size, embedding_size], - initializer=create_initializer(initializer_range)) + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) - if use_one_hot_embeddings: - flat_input_ids = tf.reshape(input_ids, [-1]) - one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) - output = tf.matmul(one_hot_input_ids, embedding_table) - else: - output = tf.nn.embedding_lookup(embedding_table, input_ids) + if use_one_hot_embeddings: + flat_input_ids = tf.reshape(input_ids, [-1]) + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) + output = tf.matmul(one_hot_input_ids, embedding_table) + else: + output = tf.nn.embedding_lookup(embedding_table, input_ids) - input_shape = get_shape_list(input_ids) + input_shape = get_shape_list(input_ids) - output = tf.reshape(output, - input_shape[0:-1] + [input_shape[-1] * embedding_size]) - return (output, embedding_table) + output = tf.reshape(output, + input_shape[0:-1] + [input_shape[-1] * embedding_size]) + return (output, embedding_table) def embedding_postprocessor(input_tensor, @@ -437,131 +437,131 @@ def embedding_postprocessor(input_tensor, initializer_range=0.02, max_position_embeddings=512, dropout_prob=0.1): - """Performs various post-processing on a word embedding tensor. + """Performs various post-processing on a word embedding tensor. - Args: - input_tensor: float Tensor of shape [batch_size, seq_length, - embedding_size]. - use_token_type: bool. Whether to add embeddings for `token_type_ids`. - token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. - Must be specified if `use_token_type` is True. - token_type_vocab_size: int. The vocabulary size of `token_type_ids`. - token_type_embedding_name: string. The name of the embedding table variable - for token type ids. - use_position_embeddings: bool. Whether to add position embeddings for the - position of each token in the sequence. - position_embedding_name: string. The name of the embedding table variable - for positional embeddings. - initializer_range: float. Range of the weight initialization. - max_position_embeddings: int. Maximum sequence length that might ever be - used with this model. This can be longer than the sequence length of - input_tensor, but cannot be shorter. - dropout_prob: float. Dropout probability applied to the final output tensor. + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. + initializer_range: float. Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. + dropout_prob: float. Dropout probability applied to the final output tensor. - Returns: - float tensor with same shape as `input_tensor`. + Returns: + float tensor with same shape as `input_tensor`. - Raises: - ValueError: One of the tensor shapes or input values is invalid. - """ - input_shape = get_shape_list(input_tensor, expected_rank=3) - batch_size = input_shape[0] - seq_length = input_shape[1] - width = input_shape[2] + Raises: + ValueError: One of the tensor shapes or input values is invalid. + """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] - if seq_length > max_position_embeddings: - raise ValueError("The seq length (%d) cannot be greater than " - "`max_position_embeddings` (%d)" % - (seq_length, max_position_embeddings)) + if seq_length > max_position_embeddings: + raise ValueError("The seq length (%d) cannot be greater than " + "`max_position_embeddings` (%d)" % + (seq_length, max_position_embeddings)) - output = input_tensor + output = input_tensor - if use_token_type: - if token_type_ids is None: - raise ValueError("`token_type_ids` must be specified if" - "`use_token_type` is True.") - token_type_table = tf.get_variable( - name=token_type_embedding_name, - shape=[token_type_vocab_size, width], - initializer=create_initializer(initializer_range)) - # This vocab will be small so we always do one-hot here, since it is always - # faster for a small vocabulary. - flat_token_type_ids = tf.reshape(token_type_ids, [-1]) - one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) - token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) - token_type_embeddings = tf.reshape(token_type_embeddings, - [batch_size, seq_length, width]) - output += token_type_embeddings + if use_token_type: + if token_type_ids is None: + raise ValueError("`token_type_ids` must be specified if" + "`use_token_type` is True.") + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range)) + # This vocab will be small so we always do one-hot here, since it is always + # faster for a small vocabulary. + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape(token_type_embeddings, + [batch_size, seq_length, width]) + output += token_type_embeddings - if use_position_embeddings: - full_position_embeddings = tf.get_variable( - name=position_embedding_name, - shape=[max_position_embeddings, width], - initializer=create_initializer(initializer_range)) - # Since the position embedding table is a learned variable, we create it - # using a (long) sequence length `max_position_embeddings`. The actual - # sequence length might be shorter than this, for faster training of - # tasks that do not have long sequences. - # - # So `full_position_embeddings` is effectively an embedding table - # for position [0, 1, 2, ..., max_position_embeddings-1], and the current - # sequence has positions [0, 1, 2, ... seq_length-1], so we can just - # perform a slice. - if seq_length < max_position_embeddings: - position_embeddings = tf.slice(full_position_embeddings, [0, 0], - [seq_length, -1]) - else: - position_embeddings = full_position_embeddings + if use_position_embeddings: + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range)) + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. + if seq_length < max_position_embeddings: + position_embeddings = tf.slice(full_position_embeddings, [0, 0], + [seq_length, -1]) + else: + position_embeddings = full_position_embeddings - num_dims = len(output.shape.as_list()) + num_dims = len(output.shape.as_list()) - # Only the last two dimensions are relevant (`seq_length` and `width`), so - # we broadcast among the first dimensions, which is typically just - # the batch size. - position_broadcast_shape = [] - for _ in range(num_dims - 2): - position_broadcast_shape.append(1) - position_broadcast_shape.extend([seq_length, width]) - position_embeddings = tf.reshape(position_embeddings, - position_broadcast_shape) - output += position_embeddings + # Only the last two dimensions are relevant (`seq_length` and `width`), so + # we broadcast among the first dimensions, which is typically just + # the batch size. + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape(position_embeddings, + position_broadcast_shape) + output += position_embeddings - output = layer_norm_and_dropout(output, dropout_prob) - return output + output = layer_norm_and_dropout(output, dropout_prob) + return output def create_attention_mask_from_input_mask(from_tensor, to_mask): - """Create 3D attention mask from a 2D tensor mask. + """Create 3D attention mask from a 2D tensor mask. - Args: - from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. - to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + Args: + from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. + to_mask: int32 Tensor of shape [batch_size, to_seq_length]. - Returns: - float Tensor of shape [batch_size, from_seq_length, to_seq_length]. - """ - from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) - batch_size = from_shape[0] - from_seq_length = from_shape[1] + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length]. + """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + batch_size = from_shape[0] + from_seq_length = from_shape[1] - to_shape = get_shape_list(to_mask, expected_rank=2) - to_seq_length = to_shape[1] + to_shape = get_shape_list(to_mask, expected_rank=2) + to_seq_length = to_shape[1] - to_mask = tf.cast( - tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) - # We don't assume that `from_tensor` is a mask (although it could be). We - # don't actually care if we attend *from* padding tokens (only *to* padding) - # tokens so we create a tensor of all ones. - # - # `broadcast_ones` = [batch_size, from_seq_length, 1] - broadcast_ones = tf.ones( - shape=[batch_size, from_seq_length, 1], dtype=tf.float32) + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=tf.float32) - # Here we broadcast along two dimensions to create the mask. - mask = broadcast_ones * to_mask + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask - return mask + return mask def attention_layer(from_tensor, @@ -578,185 +578,185 @@ def attention_layer(from_tensor, batch_size=None, from_seq_length=None, to_seq_length=None): - """Performs multi-headed attention from `from_tensor` to `to_tensor`. + """Performs multi-headed attention from `from_tensor` to `to_tensor`. - This is an implementation of multi-headed attention based on "Attention - is all you Need". If `from_tensor` and `to_tensor` are the same, then - this is self-attention. Each timestep in `from_tensor` attends to the - corresponding sequence in `to_tensor`, and returns a fixed-with vector. + This is an implementation of multi-headed attention based on "Attention + is all you Need". If `from_tensor` and `to_tensor` are the same, then + this is self-attention. Each timestep in `from_tensor` attends to the + corresponding sequence in `to_tensor`, and returns a fixed-with vector. - This function first projects `from_tensor` into a "query" tensor and - `to_tensor` into "key" and "value" tensors. These are (effectively) a list - of tensors of length `num_attention_heads`, where each tensor is of shape - [batch_size, seq_length, size_per_head]. + This function first projects `from_tensor` into a "query" tensor and + `to_tensor` into "key" and "value" tensors. These are (effectively) a list + of tensors of length `num_attention_heads`, where each tensor is of shape + [batch_size, seq_length, size_per_head]. - Then, the query and key tensors are dot-producted and scaled. These are - softmaxed to obtain attention probabilities. The value tensors are then - interpolated by these probabilities, then concatenated back to a single - tensor and returned. + Then, the query and key tensors are dot-producted and scaled. These are + softmaxed to obtain attention probabilities. The value tensors are then + interpolated by these probabilities, then concatenated back to a single + tensor and returned. - In practice, the multi-headed attention are done with transposes and - reshapes rather than actual separate tensors. + In practice, the multi-headed attention are done with transposes and + reshapes rather than actual separate tensors. - Args: - from_tensor: float Tensor of shape [batch_size, from_seq_length, - from_width]. - to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. - attention_mask: (optional) int32 Tensor of shape [batch_size, - from_seq_length, to_seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions in - the mask that are 0, and will be unchaged for positions that are 1. - num_attention_heads: int. Number of attention heads. - size_per_head: int. Size of each attention head. - query_act: (optional) Activation function for the query transform. - key_act: (optional) Activation function for the key transform. - value_act: (optional) Activation function for the value transform. - attention_probs_dropout_prob: - initializer_range: float. Range of the weight initializer. - do_return_2d_tensor: bool. If True, the output will be of shape [batch_size - * from_seq_length, num_attention_heads * size_per_head]. If False, the - output will be of shape [batch_size, from_seq_length, num_attention_heads - * size_per_head]. - batch_size: (Optional) int. If the input is 2D, this might be the batch size - of the 3D version of the `from_tensor` and `to_tensor`. - from_seq_length: (Optional) If the input is 2D, this might be the seq length - of the 3D version of the `from_tensor`. - to_seq_length: (Optional) If the input is 2D, this might be the seq length - of the 3D version of the `to_tensor`. + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchaged for positions that are 1. + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: + initializer_range: float. Range of the weight initializer. + do_return_2d_tensor: bool. If True, the output will be of shape [batch_size + * from_seq_length, num_attention_heads * size_per_head]. If False, the + output will be of shape [batch_size, from_seq_length, num_attention_heads + * size_per_head]. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. + to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. - Returns: - float Tensor of shape [batch_size, from_seq_length, - num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is - true, this will be of shape [batch_size * from_seq_length, - num_attention_heads * size_per_head]). + Returns: + float Tensor of shape [batch_size, from_seq_length, + num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is + true, this will be of shape [batch_size * from_seq_length, + num_attention_heads * size_per_head]). - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. - """ + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ - def transpose_for_scores(input_tensor, batch_size, num_attention_heads, - seq_length, width): - output_tensor = tf.reshape( - input_tensor, [batch_size, seq_length, num_attention_heads, width]) + def transpose_for_scores(input_tensor, batch_size, num_attention_heads, + seq_length, width): + output_tensor = tf.reshape( + input_tensor, [batch_size, seq_length, num_attention_heads, width]) - output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) - return output_tensor + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor - from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) - to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) - if len(from_shape) != len(to_shape): - raise ValueError( - "The rank of `from_tensor` must match the rank of `to_tensor`.") + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") - if len(from_shape) == 3: - batch_size = from_shape[0] - from_seq_length = from_shape[1] - to_seq_length = to_shape[1] - elif len(from_shape) == 2: - if (batch_size is None or from_seq_length is None or to_seq_length is None): - raise ValueError( - "When passing in rank 2 tensors to attention_layer, the values " - "for `batch_size`, `from_seq_length`, and `to_seq_length` " - "must all be specified.") + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") - # Scalar dimensions referenced here: - # B = batch size (number of sequences) - # F = `from_tensor` sequence length - # T = `to_tensor` sequence length - # N = `num_attention_heads` - # H = `size_per_head` + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` - from_tensor_2d = reshape_to_matrix(from_tensor) - to_tensor_2d = reshape_to_matrix(to_tensor) + from_tensor_2d = reshape_to_matrix(from_tensor) + to_tensor_2d = reshape_to_matrix(to_tensor) - # `query_layer` = [B*F, N*H] - query_layer = tf.layers.dense( - from_tensor_2d, - num_attention_heads * size_per_head, - activation=query_act, - name="query", - kernel_initializer=create_initializer(initializer_range)) + # `query_layer` = [B*F, N*H] + query_layer = tf.layers.dense( + from_tensor_2d, + num_attention_heads * size_per_head, + activation=query_act, + name="query", + kernel_initializer=create_initializer(initializer_range)) - # `key_layer` = [B*T, N*H] - key_layer = tf.layers.dense( - to_tensor_2d, - num_attention_heads * size_per_head, - activation=key_act, - name="key", - kernel_initializer=create_initializer(initializer_range)) + # `key_layer` = [B*T, N*H] + key_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=key_act, + name="key", + kernel_initializer=create_initializer(initializer_range)) - # `value_layer` = [B*T, N*H] - value_layer = tf.layers.dense( - to_tensor_2d, - num_attention_heads * size_per_head, - activation=value_act, - name="value", - kernel_initializer=create_initializer(initializer_range)) + # `value_layer` = [B*T, N*H] + value_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=value_act, + name="value", + kernel_initializer=create_initializer(initializer_range)) - # `query_layer` = [B, N, F, H] - query_layer = transpose_for_scores(query_layer, batch_size, - num_attention_heads, from_seq_length, - size_per_head) + # `query_layer` = [B, N, F, H] + query_layer = transpose_for_scores(query_layer, batch_size, + num_attention_heads, from_seq_length, + size_per_head) - # `key_layer` = [B, N, T, H] - key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, - to_seq_length, size_per_head) + # `key_layer` = [B, N, T, H] + key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, + to_seq_length, size_per_head) - # Take the dot product between "query" and "key" to get the raw - # attention scores. - # `attention_scores` = [B, N, F, T] - attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) - attention_scores = tf.multiply(attention_scores, - 1.0 / math.sqrt(float(size_per_head))) + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(size_per_head))) - if attention_mask is not None: - # `attention_mask` = [B, 1, F, T] - attention_mask = tf.expand_dims(attention_mask, axis=[1]) + if attention_mask is not None: + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_scores += adder + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_scores += adder - # Normalize the attention scores to probabilities. - # `attention_probs` = [B, N, F, T] - attention_probs = tf.nn.softmax(attention_scores) + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = tf.nn.softmax(attention_scores) - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) - # `value_layer` = [B, T, N, H] - value_layer = tf.reshape( - value_layer, - [batch_size, to_seq_length, num_attention_heads, size_per_head]) + # `value_layer` = [B, T, N, H] + value_layer = tf.reshape( + value_layer, + [batch_size, to_seq_length, num_attention_heads, size_per_head]) - # `value_layer` = [B, N, T, H] - value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + # `value_layer` = [B, N, T, H] + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) - # `context_layer` = [B, N, F, H] - context_layer = tf.matmul(attention_probs, value_layer) + # `context_layer` = [B, N, F, H] + context_layer = tf.matmul(attention_probs, value_layer) - # `context_layer` = [B, F, N, H] - context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + # `context_layer` = [B, F, N, H] + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) - if do_return_2d_tensor: - # `context_layer` = [B*F, N*V] - context_layer = tf.reshape( - context_layer, - [batch_size * from_seq_length, num_attention_heads * size_per_head]) - else: - # `context_layer` = [B, F, N*V] - context_layer = tf.reshape( - context_layer, - [batch_size, from_seq_length, num_attention_heads * size_per_head]) + if do_return_2d_tensor: + # `context_layer` = [B*F, N*V] + context_layer = tf.reshape( + context_layer, + [batch_size * from_seq_length, num_attention_heads * size_per_head]) + else: + # `context_layer` = [B, F, N*V] + context_layer = tf.reshape( + context_layer, + [batch_size, from_seq_length, num_attention_heads * size_per_head]) - return context_layer + return context_layer def transformer_model(input_tensor, @@ -770,225 +770,225 @@ def transformer_model(input_tensor, attention_probs_dropout_prob=0.1, initializer_range=0.02, do_return_all_layers=False): - """Multi-headed, multi-layer Transformer from "Attention is All You Need". + """Multi-headed, multi-layer Transformer from "Attention is All You Need". - This is almost an exact implementation of the original Transformer encoder. + This is almost an exact implementation of the original Transformer encoder. - See the original paper: - https://arxiv.org/abs/1706.03762 + See the original paper: + https://arxiv.org/abs/1706.03762 - Also see: - https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py - Args: - input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. - attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, - seq_length], with 1 for positions that can be attended to and 0 in - positions that should not be. - hidden_size: int. Hidden size of the Transformer. - num_hidden_layers: int. Number of layers (blocks) in the Transformer. - num_attention_heads: int. Number of attention heads in the Transformer. - intermediate_size: int. The size of the "intermediate" (a.k.a., feed - forward) layer. - intermediate_act_fn: function. The non-linear activation function to apply - to the output of the intermediate/feed-forward layer. - hidden_dropout_prob: float. Dropout probability for the hidden layers. - attention_probs_dropout_prob: float. Dropout probability of the attention - probabilities. - initializer_range: float. Range of the initializer (stddev of truncated - normal). - do_return_all_layers: Whether to also return all layers or just the final - layer. + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size], the final - hidden layer of the Transformer. + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. - Raises: - ValueError: A Tensor shape or parameter is invalid. - """ - if hidden_size % num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (hidden_size, num_attention_heads)) + Raises: + ValueError: A Tensor shape or parameter is invalid. + """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) - attention_head_size = int(hidden_size / num_attention_heads) - input_shape = get_shape_list(input_tensor, expected_rank=3) - batch_size = input_shape[0] - seq_length = input_shape[1] - input_width = input_shape[2] + attention_head_size = int(hidden_size / num_attention_heads) + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] - # The Transformer performs sum residuals on all layers so the input needs - # to be the same as the hidden size. - if input_width != hidden_size: - raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % - (input_width, hidden_size)) + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) - # We keep the representation as a 2D tensor to avoid re-shaping it back and - # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on - # the GPU/CPU but may not be free on the TPU, so we want to minimize them to - # help the optimizer. - prev_output = reshape_to_matrix(input_tensor) + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. + prev_output = reshape_to_matrix(input_tensor) - all_layer_outputs = [] - for layer_idx in range(num_hidden_layers): - with tf.variable_scope("layer_%d" % layer_idx): - layer_input = prev_output + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + with tf.variable_scope("layer_%d" % layer_idx): + layer_input = prev_output - with tf.variable_scope("attention"): - attention_heads = [] - with tf.variable_scope("self"): - attention_head = attention_layer( - from_tensor=layer_input, - to_tensor=layer_input, - attention_mask=attention_mask, - num_attention_heads=num_attention_heads, - size_per_head=attention_head_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - initializer_range=initializer_range, - do_return_2d_tensor=True, - batch_size=batch_size, - from_seq_length=seq_length, - to_seq_length=seq_length) - attention_heads.append(attention_head) + with tf.variable_scope("attention"): + attention_heads = [] + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) - attention_output = None - if len(attention_heads) == 1: - attention_output = attention_heads[0] - else: - # In the case where we have other sequences, we just concatenate - # them to the self-attention head before the projection. - attention_output = tf.concat(attention_heads, axis=-1) + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) - # Run a linear projection of `hidden_size` then add a residual - # with `layer_input`. - with tf.variable_scope("output"): - attention_output = tf.layers.dense( - attention_output, - hidden_size, - kernel_initializer=create_initializer(initializer_range)) - attention_output = dropout(attention_output, hidden_dropout_prob) - attention_output = layer_norm(attention_output + layer_input) + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) - # The activation is only applied to the "intermediate" hidden layer. - with tf.variable_scope("intermediate"): - intermediate_output = tf.layers.dense( - attention_output, - intermediate_size, - activation=intermediate_act_fn, - kernel_initializer=create_initializer(initializer_range)) + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = tf.layers.dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) - # Down-project back to `hidden_size` then add the residual. - with tf.variable_scope("output"): - layer_output = tf.layers.dense( - intermediate_output, - hidden_size, - kernel_initializer=create_initializer(initializer_range)) - layer_output = dropout(layer_output, hidden_dropout_prob) - layer_output = layer_norm(layer_output + attention_output) - prev_output = layer_output - all_layer_outputs.append(layer_output) + # Down-project back to `hidden_size` then add the residual. + with tf.variable_scope("output"): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + prev_output = layer_output + all_layer_outputs.append(layer_output) - if do_return_all_layers: - final_outputs = [] - for layer_output in all_layer_outputs: - final_output = reshape_from_matrix(layer_output, input_shape) - final_outputs.append(final_output) - return final_outputs - else: - final_output = reshape_from_matrix(prev_output, input_shape) - return final_output + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = reshape_from_matrix(prev_output, input_shape) + return final_output def get_shape_list(tensor, expected_rank=None, name=None): - """Returns a list of the shape of tensor, preferring static dimensions. + """Returns a list of the shape of tensor, preferring static dimensions. - Args: - tensor: A tf.Tensor object to find the shape of. - expected_rank: (optional) int. The expected rank of `tensor`. If this is - specified and the `tensor` has a different rank, and exception will be - thrown. - name: Optional name of the tensor for the error message. + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. - Returns: - A list of dimensions of the shape of tensor. All static dimensions will - be returned as python integers, and dynamic dimensions will be returned - as tf.Tensor scalars. - """ - if name is None: - name = tensor.name + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name - if expected_rank is not None: - assert_rank(tensor, expected_rank, name) + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) - shape = tensor.shape.as_list() + shape = tensor.shape.as_list() - non_static_indexes = [] - for (index, dim) in enumerate(shape): - if dim is None: - non_static_indexes.append(index) + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) - if not non_static_indexes: + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] return shape - dyn_shape = tf.shape(tensor) - for index in non_static_indexes: - shape[index] = dyn_shape[index] - return shape - def reshape_to_matrix(input_tensor): - """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" - ndims = input_tensor.shape.ndims - if ndims < 2: - raise ValueError("Input tensor must have at least rank 2. Shape = %s" % - (input_tensor.shape)) - if ndims == 2: - return input_tensor + """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" + ndims = input_tensor.shape.ndims + if ndims < 2: + raise ValueError("Input tensor must have at least rank 2. Shape = %s" % + (input_tensor.shape)) + if ndims == 2: + return input_tensor - width = input_tensor.shape[-1] - output_tensor = tf.reshape(input_tensor, [-1, width]) - return output_tensor + width = input_tensor.shape[-1] + output_tensor = tf.reshape(input_tensor, [-1, width]) + return output_tensor def reshape_from_matrix(output_tensor, orig_shape_list): - """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" - if len(orig_shape_list) == 2: - return output_tensor + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor - output_shape = get_shape_list(output_tensor) + output_shape = get_shape_list(output_tensor) - orig_dims = orig_shape_list[0:-1] - width = output_shape[-1] + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] - return tf.reshape(output_tensor, orig_dims + [width]) + return tf.reshape(output_tensor, orig_dims + [width]) def assert_rank(tensor, expected_rank, name=None): - """Raises an exception if the tensor rank is not of the expected rank. + """Raises an exception if the tensor rank is not of the expected rank. - Args: - tensor: A tf.Tensor to check the rank of. - expected_rank: Python integer or list of integers, expected rank. - name: Optional name of the tensor for the error message. + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. - Raises: - ValueError: If the expected shape doesn"t match the actual shape. - """ - if name is None: - name = tensor.name + Raises: + ValueError: If the expected shape doesn"t match the actual shape. + """ + if name is None: + name = tensor.name - expected_rank_dict = {} - if isinstance(expected_rank, six.integer_types): - expected_rank_dict[expected_rank] = True - else: - for x in expected_rank: - expected_rank_dict[x] = True + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True - actual_rank = tensor.shape.ndims - if actual_rank not in expected_rank_dict: - scope_name = tf.get_variable_scope().name - raise ValueError( - "For the tensor `%s` in scope `%s`, the actual rank " - "`%d` (shape = %s) is not equal to the expected rank `%s`" % - (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) diff --git a/modeling_test.py b/modeling_test.py index eb939b2ded..bed462f3c0 100644 --- a/modeling_test.py +++ b/modeling_test.py @@ -27,250 +27,249 @@ import tensorflow as tf class BertModelTest(tf.test.TestCase): + class BertModelTester(object): - class BertModelTester(object): + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + scope=None): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.scope = scope - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - scope=None): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.scope = scope + def create_model(self): + input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], + self.vocab_size) - def create_model(self): - input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], - self.vocab_size) + input_mask = None + if self.use_input_mask: + input_mask = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], vocab_size=2) - input_mask = None - if self.use_input_mask: - input_mask = BertModelTest.ids_tensor( - [self.batch_size, self.seq_length], vocab_size=2) + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], self.type_vocab_size) - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = BertModelTest.ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size) + config = modeling.BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) - config = modeling.BertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + model = modeling.BertModel( + config=config, + is_training=self.is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=token_type_ids, + scope=self.scope) - model = modeling.BertModel( - config=config, - is_training=self.is_training, - input_ids=input_ids, - input_mask=input_mask, - token_type_ids=token_type_ids, - scope=self.scope) + outputs = { + "embedding_output": model.get_embedding_output(), + "sequence_output": model.get_sequence_output(), + "pooled_output": model.get_pooled_output(), + "all_encoder_layers": model.get_all_encoder_layers(), + } + return outputs - outputs = { - "embedding_output": model.get_embedding_output(), - "sequence_output": model.get_sequence_output(), - "pooled_output": model.get_pooled_output(), - "all_encoder_layers": model.get_all_encoder_layers(), - } - return outputs + def check_output(self, result): + self.parent.assertAllEqual( + result["embedding_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) - def check_output(self, result): - self.parent.assertAllEqual( - result["embedding_output"].shape, - [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertAllEqual( + result["sequence_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) - self.parent.assertAllEqual( - result["sequence_output"].shape, - [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertAllEqual(result["pooled_output"].shape, + [self.batch_size, self.hidden_size]) - self.parent.assertAllEqual(result["pooled_output"].shape, - [self.batch_size, self.hidden_size]) + def test_default(self): + self.run_tester(BertModelTest.BertModelTester(self)) - def test_default(self): - self.run_tester(BertModelTest.BertModelTester(self)) + def test_config_to_json_string(self): + config = modeling.BertConfig(vocab_size=99, hidden_size=37) + obj = json.loads(config.to_json_string()) + self.assertEqual(obj["vocab_size"], 99) + self.assertEqual(obj["hidden_size"], 37) - def test_config_to_json_string(self): - config = modeling.BertConfig(vocab_size=99, hidden_size=37) - obj = json.loads(config.to_json_string()) - self.assertEqual(obj["vocab_size"], 99) - self.assertEqual(obj["hidden_size"], 37) + def run_tester(self, tester): + with self.test_session() as sess: + ops = tester.create_model() + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + output_result = sess.run(ops) + tester.check_output(output_result) - def run_tester(self, tester): - with self.test_session() as sess: - ops = tester.create_model() - init_op = tf.group(tf.global_variables_initializer(), - tf.local_variables_initializer()) - sess.run(init_op) - output_result = sess.run(ops) - tester.check_output(output_result) + self.assert_all_tensors_reachable(sess, [init_op, ops]) - self.assert_all_tensors_reachable(sess, [init_op, ops]) + @classmethod + def ids_tensor(cls, shape, vocab_size, rng=None, name=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is None: + rng = random.Random() - @classmethod - def ids_tensor(cls, shape, vocab_size, rng=None, name=None): - """Creates a random int32 tensor of the shape within the vocab size.""" - if rng is None: - rng = random.Random() + total_dims = 1 + for dim in shape: + total_dims *= dim - total_dims = 1 - for dim in shape: - total_dims *= dim + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) - values = [] - for _ in range(total_dims): - values.append(rng.randint(0, vocab_size - 1)) + return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name) - return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name) + def assert_all_tensors_reachable(self, sess, outputs): + """Checks that all the tensors in the graph are reachable from outputs.""" + graph = sess.graph - def assert_all_tensors_reachable(self, sess, outputs): - """Checks that all the tensors in the graph are reachable from outputs.""" - graph = sess.graph + ignore_strings = [ + "^.*/dilation_rate$", + "^.*/Tensordot/concat$", + "^.*/Tensordot/concat/axis$", + "^testing/.*$", + ] - ignore_strings = [ - "^.*/dilation_rate$", - "^.*/Tensordot/concat$", - "^.*/Tensordot/concat/axis$", - "^testing/.*$", - ] + ignore_regexes = [re.compile(x) for x in ignore_strings] - ignore_regexes = [re.compile(x) for x in ignore_strings] + unreachable = self.get_unreachable_ops(graph, outputs) + filtered_unreachable = [] + for x in unreachable: + do_ignore = False + for r in ignore_regexes: + m = r.match(x.name) + if m is not None: + do_ignore = True + if do_ignore: + continue + filtered_unreachable.append(x) + unreachable = filtered_unreachable - unreachable = self.get_unreachable_ops(graph, outputs) - filtered_unreachable = [] - for x in unreachable: - do_ignore = False - for r in ignore_regexes: - m = r.match(x.name) - if m is not None: - do_ignore = True - if do_ignore: - continue - filtered_unreachable.append(x) - unreachable = filtered_unreachable + self.assertEqual( + len(unreachable), 0, "The following ops are unreachable: %s" % + (" ".join([x.name for x in unreachable]))) - self.assertEqual( - len(unreachable), 0, "The following ops are unreachable: %s" % - (" ".join([x.name for x in unreachable]))) + @classmethod + def get_unreachable_ops(cls, graph, outputs): + """Finds all of the tensors in graph that are unreachable from outputs.""" + outputs = cls.flatten_recursive(outputs) + output_to_op = collections.defaultdict(list) + op_to_all = collections.defaultdict(list) + assign_out_to_in = collections.defaultdict(list) - @classmethod - def get_unreachable_ops(cls, graph, outputs): - """Finds all of the tensors in graph that are unreachable from outputs.""" - outputs = cls.flatten_recursive(outputs) - output_to_op = collections.defaultdict(list) - op_to_all = collections.defaultdict(list) - assign_out_to_in = collections.defaultdict(list) + for op in graph.get_operations(): + for x in op.inputs: + op_to_all[op.name].append(x.name) + for y in op.outputs: + output_to_op[y.name].append(op.name) + op_to_all[op.name].append(y.name) + if str(op.type) == "Assign": + for y in op.outputs: + for x in op.inputs: + assign_out_to_in[y.name].append(x.name) - for op in graph.get_operations(): - for x in op.inputs: - op_to_all[op.name].append(x.name) - for y in op.outputs: - output_to_op[y.name].append(op.name) - op_to_all[op.name].append(y.name) - if str(op.type) == "Assign": - for y in op.outputs: - for x in op.inputs: - assign_out_to_in[y.name].append(x.name) + assign_groups = collections.defaultdict(list) + for out_name in assign_out_to_in.keys(): + name_group = assign_out_to_in[out_name] + for n1 in name_group: + assign_groups[n1].append(out_name) + for n2 in name_group: + if n1 != n2: + assign_groups[n1].append(n2) - assign_groups = collections.defaultdict(list) - for out_name in assign_out_to_in.keys(): - name_group = assign_out_to_in[out_name] - for n1 in name_group: - assign_groups[n1].append(out_name) - for n2 in name_group: - if n1 != n2: - assign_groups[n1].append(n2) + seen_tensors = {} + stack = [x.name for x in outputs] + while stack: + name = stack.pop() + if name in seen_tensors: + continue + seen_tensors[name] = True - seen_tensors = {} - stack = [x.name for x in outputs] - while stack: - name = stack.pop() - if name in seen_tensors: - continue - seen_tensors[name] = True + if name in output_to_op: + for op_name in output_to_op[name]: + if op_name in op_to_all: + for input_name in op_to_all[op_name]: + if input_name not in stack: + stack.append(input_name) - if name in output_to_op: - for op_name in output_to_op[name]: - if op_name in op_to_all: - for input_name in op_to_all[op_name]: - if input_name not in stack: - stack.append(input_name) + expanded_names = [] + if name in assign_groups: + for assign_name in assign_groups[name]: + expanded_names.append(assign_name) - expanded_names = [] - if name in assign_groups: - for assign_name in assign_groups[name]: - expanded_names.append(assign_name) + for expanded_name in expanded_names: + if expanded_name not in stack: + stack.append(expanded_name) - for expanded_name in expanded_names: - if expanded_name not in stack: - stack.append(expanded_name) + unreachable_ops = [] + for op in graph.get_operations(): + is_unreachable = False + all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs] + for name in all_names: + if name not in seen_tensors: + is_unreachable = True + if is_unreachable: + unreachable_ops.append(op) + return unreachable_ops - unreachable_ops = [] - for op in graph.get_operations(): - is_unreachable = False - all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs] - for name in all_names: - if name not in seen_tensors: - is_unreachable = True - if is_unreachable: - unreachable_ops.append(op) - return unreachable_ops + @classmethod + def flatten_recursive(cls, item): + """Flattens (potentially nested) a tuple/dictionary/list to a list.""" + output = [] + if isinstance(item, list): + output.extend(item) + elif isinstance(item, tuple): + output.extend(list(item)) + elif isinstance(item, dict): + for (_, v) in six.iteritems(item): + output.append(v) + else: + return [item] - @classmethod - def flatten_recursive(cls, item): - """Flattens (potentially nested) a tuple/dictionary/list to a list.""" - output = [] - if isinstance(item, list): - output.extend(item) - elif isinstance(item, tuple): - output.extend(list(item)) - elif isinstance(item, dict): - for (_, v) in six.iteritems(item): - output.append(v) - else: - return [item] - - flat_output = [] - for x in output: - flat_output.extend(cls.flatten_recursive(x)) - return flat_output + flat_output = [] + for x in output: + flat_output.extend(cls.flatten_recursive(x)) + return flat_output if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/optimization.py b/optimization.py index 1038bb5b84..72dcd76398 100644 --- a/optimization.py +++ b/optimization.py @@ -23,149 +23,149 @@ import tensorflow as tf def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): - """Creates an optimizer training op.""" - global_step = tf.train.get_or_create_global_step() + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() - learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) - # Implements linear decay of the learning rate. - learning_rate = tf.train.polynomial_decay( - learning_rate, - global_step, - num_train_steps, - end_learning_rate=0.0, - power=1.0, - cycle=False) + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) - # Implements linear warmup. I.e., if global_step < num_warmup_steps, the - # learning rate will be `global_step/num_warmup_steps * init_lr`. - if num_warmup_steps: - global_steps_int = tf.cast(global_step, tf.int32) - warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + if num_warmup_steps: + global_steps_int = tf.cast(global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) - global_steps_float = tf.cast(global_steps_int, tf.float32) - warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) - warmup_percent_done = global_steps_float / warmup_steps_float - warmup_learning_rate = init_lr * warmup_percent_done + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done - is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) - learning_rate = ( - (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ( + (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) - # It is recommended that you use this optimizer for fine tuning, since this - # is how the model was trained (note that the Adam m/v variables are NOT - # loaded from init_checkpoint.) - optimizer = AdamWeightDecayOptimizer( - learning_rate=learning_rate, - weight_decay_rate=0.01, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) + optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) - if use_tpu: - optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) + if use_tpu: + optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) - tvars = tf.trainable_variables() - grads = tf.gradients(loss, tvars) + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) - # This is how the model was pre-trained. - (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) + # This is how the model was pre-trained. + (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) - train_op = optimizer.apply_gradients( - zip(grads, tvars), global_step=global_step) + train_op = optimizer.apply_gradients( + zip(grads, tvars), global_step=global_step) - new_global_step = global_step + 1 - train_op = tf.group(train_op, [global_step.assign(new_global_step)]) - return train_op + new_global_step = global_step + 1 + train_op = tf.group(train_op, [global_step.assign(new_global_step)]) + return train_op class AdamWeightDecayOptimizer(tf.train.Optimizer): - """A basic Adam optimizer that includes "correct" L2 weight decay.""" + """A basic Adam optimizer that includes "correct" L2 weight decay.""" - def __init__(self, - learning_rate, - weight_decay_rate=0.0, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=None, - name="AdamWeightDecayOptimizer"): - """Constructs a AdamWeightDecayOptimizer.""" - super(AdamWeightDecayOptimizer, self).__init__(False, name) + def __init__(self, + learning_rate, + weight_decay_rate=0.0, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="AdamWeightDecayOptimizer"): + """Constructs a AdamWeightDecayOptimizer.""" + super(AdamWeightDecayOptimizer, self).__init__(False, name) - self.learning_rate = learning_rate - self.weight_decay_rate = weight_decay_rate - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - self.exclude_from_weight_decay = exclude_from_weight_decay + self.learning_rate = learning_rate + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay - def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """See base class.""" - assignments = [] - for (grad, param) in grads_and_vars: - if grad is None or param is None: - continue + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue - param_name = self._get_variable_name(param.name) + param_name = self._get_variable_name(param.name) - m = tf.get_variable( - name=param_name + "/adam_m", - shape=param.shape.as_list(), - dtype=tf.float32, - trainable=False, - initializer=tf.zeros_initializer()) - v = tf.get_variable( - name=param_name + "/adam_v", - shape=param.shape.as_list(), - dtype=tf.float32, - trainable=False, - initializer=tf.zeros_initializer()) + m = tf.get_variable( + name=param_name + "/adam_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=param_name + "/adam_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) - # Standard Adam update. - next_m = ( - tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) - next_v = ( - tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, - tf.square(grad))) + # Standard Adam update. + next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) - update = next_m / (tf.sqrt(next_v) + self.epsilon) + update = next_m / (tf.sqrt(next_v) + self.epsilon) - # Just adding the square of the weights to the loss function is *not* - # the correct way of using L2 regularization/weight decay with Adam, - # since that will interact with the m and v parameters in strange ways. - # - # Instead we want ot decay the weights in a manner that doesn't interact - # with the m/v parameters. This is equivalent to adding the square - # of the weights to the loss with plain (non-momentum) SGD. - if self._do_use_weight_decay(param_name): - update += self.weight_decay_rate * param + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param - update_with_lr = self.learning_rate * update + update_with_lr = self.learning_rate * update - next_param = param - update_with_lr + next_param = param - update_with_lr - assignments.extend( - [param.assign(next_param), - m.assign(next_m), - v.assign(next_v)]) - return tf.group(*assignments, name=name) + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) - def _do_use_weight_decay(self, param_name): - """Whether to use L2 weight decay for `param_name`.""" - if not self.weight_decay_rate: - return False - if self.exclude_from_weight_decay: - for r in self.exclude_from_weight_decay: - if re.search(r, param_name) is not None: - return False - return True + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True - def _get_variable_name(self, param_name): - """Get the variable name from the tensor name.""" - m = re.match("^(.*):\\d+$", param_name) - if m is not None: - param_name = m.group(1) - return param_name + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/optimization_test.py b/optimization_test.py index 4f2dcf133f..266ececd2d 100644 --- a/optimization_test.py +++ b/optimization_test.py @@ -22,27 +22,27 @@ import tensorflow as tf class OptimizationTest(tf.test.TestCase): - def test_adam(self): - with self.test_session() as sess: - w = tf.get_variable( - "w", - shape=[3], - initializer=tf.constant_initializer([0.1, -0.2, -0.1])) - x = tf.constant([0.4, 0.2, -0.5]) - loss = tf.reduce_mean(tf.square(x - w)) - tvars = tf.trainable_variables() - grads = tf.gradients(loss, tvars) - global_step = tf.train.get_or_create_global_step() - optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) - train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) - init_op = tf.group(tf.global_variables_initializer(), - tf.local_variables_initializer()) - sess.run(init_op) - for _ in range(100): - sess.run(train_op) - w_np = sess.run(w) - self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) + def test_adam(self): + with self.test_session() as sess: + w = tf.get_variable( + "w", + shape=[3], + initializer=tf.constant_initializer([0.1, -0.2, -0.1])) + x = tf.constant([0.4, 0.2, -0.5]) + loss = tf.reduce_mean(tf.square(x - w)) + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + global_step = tf.train.get_or_create_global_step() + optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) + train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + for _ in range(100): + sess.run(train_op) + w_np = sess.run(w) + self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/run_classifier.py b/run_classifier.py index aa35490f8b..a1ab100229 100644 --- a/run_classifier.py +++ b/run_classifier.py @@ -118,583 +118,583 @@ flags.DEFINE_integer( class InputExample(object): - """A single training/test example for simple sequence classification.""" + """A single training/test example for simple sequence classification.""" - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label class InputFeatures(object): - """A single set of features of data.""" + """A single set of features of data.""" - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id + def __init__(self, input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" + """Base class for data converters for sequence classification data sets.""" - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with tf.gfile.Open(input_file, "r") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" + """Processor for the MultiNLI data set (GLUE version).""" - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) - text_a = tokenization.convert_to_unicode(line[8]) - text_b = tokenization.convert_to_unicode(line[9]) - label = tokenization.convert_to_unicode(line[-1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) + text_a = tokenization.convert_to_unicode(line[8]) + text_b = tokenization.convert_to_unicode(line[9]) + label = tokenization.convert_to_unicode(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" + """Processor for the MRPC data set (GLUE version).""" - def get_train_examples(self, data_dir): - """See base class.""" - print("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + def get_train_examples(self, data_dir): + """See base class.""" + print("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - def get_labels(self): - """See base class.""" - return ["0", "1"] + def get_labels(self): + """See base class.""" + return ["0", "1"] - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = tokenization.convert_to_unicode(line[3]) - text_b = tokenization.convert_to_unicode(line[4]) - label = tokenization.convert_to_unicode(line[0]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = tokenization.convert_to_unicode(line[4]) + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" + """Processor for the CoLA data set (GLUE version).""" - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - def get_labels(self): - """See base class.""" - return ["0", "1"] + def get_labels(self): + """See base class.""" + return ["0", "1"] - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = tokenization.convert_to_unicode(line[3]) - label = tokenization.convert_to_unicode(line[1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): - """Loads a data file into a list of `InputBatch`s.""" + """Loads a data file into a list of `InputBatch`s.""" - label_map = {} - for (i, label) in enumerate(label_list): - label_map[label] = i + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i - features = [] - for (ex_index, example) in enumerate(examples): - tokens_a = tokenizer.tokenize(example.text_a) + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambigiously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambigiously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) - if tokens_b: - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) - input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_ids = tokenizer.convert_tokens_to_ids(tokens) - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length - label_id = label_map[example.label] - if ex_index < 5: - tf.logging.info("*** Example ***") - tf.logging.info("guid: %s" % (example.guid)) - tf.logging.info("tokens: %s" % " ".join( - [tokenization.printable_text(x) for x in tokens])) - tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - tf.logging.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) - features.append( - InputFeatures( - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id)) - return features + features.append( + InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id)) + return features def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" + """Truncates a sequence pair in place to the maximum length.""" - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings): - """Creates a classification model.""" - model = modeling.BertModel( - config=bert_config, - is_training=is_training, - input_ids=input_ids, - input_mask=input_mask, - token_type_ids=segment_ids, - use_one_hot_embeddings=use_one_hot_embeddings) + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) - # In the demo, we are doing a simple classification task on the entire - # segment. - # - # If you want to use the token-level output, use model.get_sequence_output() - # instead. - output_layer = model.get_pooled_output() + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. + output_layer = model.get_pooled_output() - hidden_size = output_layer.shape[-1].value + hidden_size = output_layer.shape[-1].value - output_weights = tf.get_variable( - "output_weights", [num_labels, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) - output_bias = tf.get_variable( - "output_bias", [num_labels], initializer=tf.zeros_initializer()) + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) - with tf.variable_scope("loss"): - if is_training: - # I.e., 0.1 dropout - output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) - logits = tf.matmul(output_layer, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - log_probs = tf.nn.log_softmax(logits, axis=-1) + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) - one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) - per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) - loss = tf.reduce_mean(per_example_loss) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) - return (loss, per_example_loss, logits) + return (loss, per_example_loss, logits) def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings): - """Returns `model_fn` closure for TPUEstimator.""" + """Returns `model_fn` closure for TPUEstimator.""" - def model_fn(features, labels, mode, params): # pylint: disable=unused-argument - """The `model_fn` for TPUEstimator.""" + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" - tf.logging.info("*** Features ***") - for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) - input_ids = features["input_ids"] - input_mask = features["input_mask"] - segment_ids = features["segment_ids"] - label_ids = features["label_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] - is_training = (mode == tf.estimator.ModeKeys.TRAIN) + is_training = (mode == tf.estimator.ModeKeys.TRAIN) - (total_loss, per_example_loss, logits) = create_model( - bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, - num_labels, use_one_hot_embeddings) + (total_loss, per_example_loss, logits) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) - tvars = tf.trainable_variables() + tvars = tf.trainable_variables() - scaffold_fn = None - if init_checkpoint: - (assignment_map, - initialized_variable_names) = modeling.get_assigment_map_from_checkpoint( - tvars, init_checkpoint) - if use_tpu: + scaffold_fn = None + if init_checkpoint: + (assignment_map, + initialized_variable_names) = modeling.get_assigment_map_from_checkpoint( + tvars, init_checkpoint) + if use_tpu: - def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() - scaffold_fn = tpu_scaffold - else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = "" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, - init_string) + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) - output_spec = None - if mode == tf.estimator.ModeKeys.TRAIN: + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: - train_op = optimization.create_optimizer( - total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( - mode=mode, - loss=total_loss, - train_op=train_op, - scaffold_fn=scaffold_fn) - elif mode == tf.estimator.ModeKeys.EVAL: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: - def metric_fn(per_example_loss, label_ids, logits): - predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) - accuracy = tf.metrics.accuracy(label_ids, predictions) - loss = tf.metrics.mean(per_example_loss) - return { - "eval_accuracy": accuracy, - "eval_loss": loss, - } + def metric_fn(per_example_loss, label_ids, logits): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy(label_ids, predictions) + loss = tf.metrics.mean(per_example_loss) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } - eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( - mode=mode, - loss=total_loss, - eval_metrics=eval_metrics, - scaffold_fn=scaffold_fn) - else: - raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) - return output_spec + return output_spec - return model_fn + return model_fn def input_fn_builder(features, seq_length, is_training, drop_remainder): - """Creates an `input_fn` closure to be passed to TPUEstimator.""" + """Creates an `input_fn` closure to be passed to TPUEstimator.""" - all_input_ids = [] - all_input_mask = [] - all_segment_ids = [] - all_label_ids = [] + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] - for feature in features: - all_input_ids.append(feature.input_ids) - all_input_mask.append(feature.input_mask) - all_segment_ids.append(feature.segment_ids) - all_label_ids.append(feature.label_id) + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) - def input_fn(params): - """The actual input function.""" - batch_size = params["batch_size"] + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] - num_examples = len(features) + num_examples = len(features) - # This is for demo purposes and does NOT scale to large data sets. We do - # not use Dataset.from_generator() because that uses tf.py_func which is - # not TPU compatible. The right way to load data is with TFRecordReader. - d = tf.data.Dataset.from_tensor_slices({ - "input_ids": - tf.constant( - all_input_ids, shape=[num_examples, seq_length], - dtype=tf.int32), - "input_mask": - tf.constant( - all_input_mask, - shape=[num_examples, seq_length], - dtype=tf.int32), - "segment_ids": - tf.constant( - all_segment_ids, - shape=[num_examples, seq_length], - dtype=tf.int32), - "label_ids": - tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), - }) + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) - if is_training: - d = d.repeat() - d = d.shuffle(buffer_size=100) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) - d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) - return d + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d - return input_fn + return input_fn def main(_): - tf.logging.set_verbosity(tf.logging.INFO) + tf.logging.set_verbosity(tf.logging.INFO) - processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mrpc": MrpcProcessor, - } + processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mrpc": MrpcProcessor, + } - if not FLAGS.do_train and not FLAGS.do_eval: - raise ValueError("At least one of `do_train` or `do_eval` must be True.") + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") - bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) - if FLAGS.max_seq_length > bert_config.max_position_embeddings: - raise ValueError( - "Cannot use sequence length %d because the BERT model " - "was only trained up to sequence length %d" % - (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) - tf.gfile.MakeDirs(FLAGS.output_dir) + tf.gfile.MakeDirs(FLAGS.output_dir) - task_name = FLAGS.task_name.lower() + task_name = FLAGS.task_name.lower() - if task_name not in processors: - raise ValueError("Task not found: %s" % (task_name)) + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) - processor = processors[task_name]() + processor = processors[task_name]() - label_list = processor.get_labels() + label_list = processor.get_labels() - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - tpu_cluster_resolver = None - if FLAGS.use_tpu and FLAGS.tpu_name: - tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( - FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) - is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 - run_config = tf.contrib.tpu.RunConfig( - cluster=tpu_cluster_resolver, - master=FLAGS.master, - model_dir=FLAGS.output_dir, - save_checkpoints_steps=FLAGS.save_checkpoints_steps, - tpu_config=tf.contrib.tpu.TPUConfig( - iterations_per_loop=FLAGS.iterations_per_loop, - num_shards=FLAGS.num_tpu_cores, - per_host_input_for_training=is_per_host)) + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) - train_examples = None - num_train_steps = None - num_warmup_steps = None - if FLAGS.do_train: - train_examples = processor.get_train_examples(FLAGS.data_dir) - num_train_steps = int( - len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) - num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) - model_fn = model_fn_builder( - bert_config=bert_config, - num_labels=len(label_list), - init_checkpoint=FLAGS.init_checkpoint, - learning_rate=FLAGS.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - use_tpu=FLAGS.use_tpu, - use_one_hot_embeddings=FLAGS.use_tpu) + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) - # If TPU is not available, this will fall back to normal Estimator on CPU - # or GPU. - estimator = tf.contrib.tpu.TPUEstimator( - use_tpu=FLAGS.use_tpu, - model_fn=model_fn, - config=run_config, - train_batch_size=FLAGS.train_batch_size, - eval_batch_size=FLAGS.eval_batch_size) + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) - if FLAGS.do_train: - train_features = convert_examples_to_features( - train_examples, label_list, FLAGS.max_seq_length, tokenizer) - tf.logging.info("***** Running training *****") - tf.logging.info(" Num examples = %d", len(train_examples)) - tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) - tf.logging.info(" Num steps = %d", num_train_steps) - train_input_fn = input_fn_builder( - features=train_features, - seq_length=FLAGS.max_seq_length, - is_training=True, - drop_remainder=True) - estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + if FLAGS.do_train: + train_features = convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = input_fn_builder( + features=train_features, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) - if FLAGS.do_eval: - eval_examples = processor.get_dev_examples(FLAGS.data_dir) - eval_features = convert_examples_to_features( - eval_examples, label_list, FLAGS.max_seq_length, tokenizer) + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_features = convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer) - tf.logging.info("***** Running evaluation *****") - tf.logging.info(" Num examples = %d", len(eval_examples)) - tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d", len(eval_examples)) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) - # This tells the estimator to run through the entire set. - eval_steps = None - # However, if running eval on the TPU, you will need to specify the - # number of steps. - if FLAGS.use_tpu: - # Eval will be slightly WRONG on the TPU because it will truncate - # the last batch. - eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. + if FLAGS.use_tpu: + # Eval will be slightly WRONG on the TPU because it will truncate + # the last batch. + eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) - eval_drop_remainder = True if FLAGS.use_tpu else False - eval_input_fn = input_fn_builder( - features=eval_features, - seq_length=FLAGS.max_seq_length, - is_training=False, - drop_remainder=eval_drop_remainder) + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = input_fn_builder( + features=eval_features, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) - result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) - output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") - with tf.gfile.GFile(output_eval_file, "w") as writer: - tf.logging.info("***** Eval results *****") - for key in sorted(result.keys()): - tf.logging.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) if __name__ == "__main__": - flags.mark_flag_as_required("data_dir") - flags.mark_flag_as_required("task_name") - flags.mark_flag_as_required("vocab_file") - flags.mark_flag_as_required("bert_config_file") - flags.mark_flag_as_required("output_dir") - tf.app.run() + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/run_pretraining.py b/run_pretraining.py index 2821003a92..4636f205cd 100644 --- a/run_pretraining.py +++ b/run_pretraining.py @@ -109,217 +109,217 @@ flags.DEFINE_integer( def model_fn_builder(bert_config, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings): - """Returns `model_fn` closure for TPUEstimator.""" + """Returns `model_fn` closure for TPUEstimator.""" - def model_fn(features, labels, mode, params): # pylint: disable=unused-argument - """The `model_fn` for TPUEstimator.""" + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" - tf.logging.info("*** Features ***") - for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) - input_ids = features["input_ids"] - input_mask = features["input_mask"] - segment_ids = features["segment_ids"] - masked_lm_positions = features["masked_lm_positions"] - masked_lm_ids = features["masked_lm_ids"] - masked_lm_weights = features["masked_lm_weights"] - next_sentence_labels = features["next_sentence_labels"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + next_sentence_labels = features["next_sentence_labels"] - is_training = (mode == tf.estimator.ModeKeys.TRAIN) + is_training = (mode == tf.estimator.ModeKeys.TRAIN) - model = modeling.BertModel( - config=bert_config, - is_training=is_training, - input_ids=input_ids, - input_mask=input_mask, - token_type_ids=segment_ids, - use_one_hot_embeddings=use_one_hot_embeddings) + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) - (masked_lm_loss, - masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( - bert_config, model.get_sequence_output(), model.get_embedding_table(), - masked_lm_positions, masked_lm_ids, masked_lm_weights) + (masked_lm_loss, + masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( + bert_config, model.get_sequence_output(), model.get_embedding_table(), + masked_lm_positions, masked_lm_ids, masked_lm_weights) - (next_sentence_loss, next_sentence_example_loss, - next_sentence_log_probs) = get_next_sentence_output( - bert_config, model.get_pooled_output(), next_sentence_labels) + (next_sentence_loss, next_sentence_example_loss, + next_sentence_log_probs) = get_next_sentence_output( + bert_config, model.get_pooled_output(), next_sentence_labels) - total_loss = masked_lm_loss + next_sentence_loss + total_loss = masked_lm_loss + next_sentence_loss - tvars = tf.trainable_variables() + tvars = tf.trainable_variables() - initialized_variable_names = {} - scaffold_fn = None - if init_checkpoint: - (assignment_map, - initialized_variable_names) = modeling.get_assigment_map_from_checkpoint( - tvars, init_checkpoint) - if use_tpu: + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, + initialized_variable_names) = modeling.get_assigment_map_from_checkpoint( + tvars, init_checkpoint) + if use_tpu: - def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() - scaffold_fn = tpu_scaffold - else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = "" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, - init_string) + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) - output_spec = None - if mode == tf.estimator.ModeKeys.TRAIN: - train_op = optimization.create_optimizer( - total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( - mode=mode, - loss=total_loss, - train_op=train_op, - scaffold_fn=scaffold_fn) - elif mode == tf.estimator.ModeKeys.EVAL: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: - def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, - masked_lm_weights, next_sentence_example_loss, - next_sentence_log_probs, next_sentence_labels): - """Computes the loss and accuracy of the model.""" - masked_lm_log_probs = tf.reshape(masked_lm_log_probs, - [-1, masked_lm_log_probs.shape[-1]]) - masked_lm_predictions = tf.argmax( - masked_lm_log_probs, axis=-1, output_type=tf.int32) - masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) - masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) - masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) - masked_lm_accuracy = tf.metrics.accuracy( - labels=masked_lm_ids, - predictions=masked_lm_predictions, - weights=masked_lm_weights) - masked_lm_mean_loss = tf.metrics.mean( - values=masked_lm_example_loss, weights=masked_lm_weights) + def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels): + """Computes the loss and accuracy of the model.""" + masked_lm_log_probs = tf.reshape(masked_lm_log_probs, + [-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax( + masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) - next_sentence_log_probs = tf.reshape( - next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) - next_sentence_predictions = tf.argmax( - next_sentence_log_probs, axis=-1, output_type=tf.int32) - next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) - next_sentence_accuracy = tf.metrics.accuracy( - labels=next_sentence_labels, predictions=next_sentence_predictions) - next_sentence_mean_loss = tf.metrics.mean( - values=next_sentence_example_loss) + next_sentence_log_probs = tf.reshape( + next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) + next_sentence_predictions = tf.argmax( + next_sentence_log_probs, axis=-1, output_type=tf.int32) + next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) + next_sentence_accuracy = tf.metrics.accuracy( + labels=next_sentence_labels, predictions=next_sentence_predictions) + next_sentence_mean_loss = tf.metrics.mean( + values=next_sentence_example_loss) - return { - "masked_lm_accuracy": masked_lm_accuracy, - "masked_lm_loss": masked_lm_mean_loss, - "next_sentence_accuracy": next_sentence_accuracy, - "next_sentence_loss": next_sentence_mean_loss, - } + return { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + "next_sentence_accuracy": next_sentence_accuracy, + "next_sentence_loss": next_sentence_mean_loss, + } - eval_metrics = (metric_fn, [ - masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, - masked_lm_weights, next_sentence_example_loss, - next_sentence_log_probs, next_sentence_labels - ]) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( - mode=mode, - loss=total_loss, - eval_metrics=eval_metrics, - scaffold_fn=scaffold_fn) - else: - raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + eval_metrics = (metric_fn, [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels + ]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) - return output_spec + return output_spec - return model_fn + return model_fn def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, label_ids, label_weights): - """Get loss and log probs for the masked LM.""" - input_tensor = gather_indexes(input_tensor, positions) + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) - with tf.variable_scope("cls/predictions"): - # We apply one more non-linear transformation before the output layer. - # This matrix is not used after pre-training. - with tf.variable_scope("transform"): - input_tensor = tf.layers.dense( - input_tensor, - units=bert_config.hidden_size, - activation=modeling.get_activation(bert_config.hidden_act), - kernel_initializer=modeling.create_initializer( - bert_config.initializer_range)) - input_tensor = modeling.layer_norm(input_tensor) + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - output_bias = tf.get_variable( - "output_bias", - shape=[bert_config.vocab_size], - initializer=tf.zeros_initializer()) - logits = tf.matmul(input_tensor, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - log_probs = tf.nn.log_softmax(logits, axis=-1) + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) - label_ids = tf.reshape(label_ids, [-1]) - label_weights = tf.reshape(label_weights, [-1]) + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) - one_hot_labels = tf.one_hot( - label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + one_hot_labels = tf.one_hot( + label_ids, depth=bert_config.vocab_size, dtype=tf.float32) - # The `positions` tensor might be zero-padded (if the sequence is too - # short to have the maximum number of predictions). The `label_weights` - # tensor has a value of 1.0 for every real prediction and 0.0 for the - # padding predictions. - per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) - numerator = tf.reduce_sum(label_weights * per_example_loss) - denominator = tf.reduce_sum(label_weights) + 1e-5 - loss = numerator / denominator + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator - return (loss, per_example_loss, log_probs) - - -def get_next_sentence_output(bert_config, input_tensor, labels): - """Get loss and log probs for the next sentence prediction.""" - - # Simple binary classification. Note that 0 is "next sentence" and 1 is - # "random sentence". This weight matrix is not used after pre-training. - with tf.variable_scope("cls/seq_relationship"): - output_weights = tf.get_variable( - "output_weights", - shape=[2, bert_config.hidden_size], - initializer=modeling.create_initializer(bert_config.initializer_range)) - output_bias = tf.get_variable( - "output_bias", shape=[2], initializer=tf.zeros_initializer()) - - logits = tf.matmul(input_tensor, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - log_probs = tf.nn.log_softmax(logits, axis=-1) - labels = tf.reshape(labels, [-1]) - one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) - per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) - loss = tf.reduce_mean(per_example_loss) return (loss, per_example_loss, log_probs) -def gather_indexes(sequence_tensor, positions): - """Gathers the vectors at the specific positions over a minibatch.""" - sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) - batch_size = sequence_shape[0] - seq_length = sequence_shape[1] - width = sequence_shape[2] +def get_next_sentence_output(bert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" - flat_offsets = tf.reshape( - tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) - flat_positions = tf.reshape(positions + flat_offsets, [-1]) - flat_sequence_tensor = tf.reshape(sequence_tensor, - [batch_size * seq_length, width]) - output_tensor = tf.gather(flat_sequence_tensor, flat_positions) - return output_tensor + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. + with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, bert_config.hidden_size], + initializer=modeling.create_initializer(bert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor def input_fn_builder(input_files, @@ -327,168 +327,168 @@ def input_fn_builder(input_files, max_predictions_per_seq, is_training, num_cpu_threads=4): - """Creates an `input_fn` closure to be passed to TPUEstimator.""" + """Creates an `input_fn` closure to be passed to TPUEstimator.""" - def input_fn(params): - """The actual input function.""" - batch_size = params["batch_size"] + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] - name_to_features = { - "input_ids": - tf.FixedLenFeature([max_seq_length], tf.int64), - "input_mask": - tf.FixedLenFeature([max_seq_length], tf.int64), - "segment_ids": - tf.FixedLenFeature([max_seq_length], tf.int64), - "masked_lm_positions": - tf.FixedLenFeature([max_predictions_per_seq], tf.int64), - "masked_lm_ids": - tf.FixedLenFeature([max_predictions_per_seq], tf.int64), - "masked_lm_weights": - tf.FixedLenFeature([max_predictions_per_seq], tf.float32), - "next_sentence_labels": - tf.FixedLenFeature([1], tf.int64), - } + name_to_features = { + "input_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": + tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "masked_lm_positions": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_ids": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_weights": + tf.FixedLenFeature([max_predictions_per_seq], tf.float32), + "next_sentence_labels": + tf.FixedLenFeature([1], tf.int64), + } - # For training, we want a lot of parallel reading and shuffling. - # For eval, we want no shuffling and parallel reading doesn't matter. - if is_training: - d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) - d = d.repeat() - d = d.shuffle(buffer_size=len(input_files)) + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + if is_training: + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files)) - # `cycle_length` is the number of parallel files that get read. - cycle_length = min(num_cpu_threads, len(input_files)) + # `cycle_length` is the number of parallel files that get read. + cycle_length = min(num_cpu_threads, len(input_files)) - # `sloppy` mode means that the interleaving is not exact. This adds - # even more randomness to the training pipeline. - d = d.apply( - tf.contrib.data.parallel_interleave( - tf.data.TFRecordDataset, - sloppy=is_training, - cycle_length=cycle_length)) - d = d.shuffle(buffer_size=100) - else: - d = tf.data.TFRecordDataset(input_files) - # Since we evaluate for a fixed number of steps we don't want to encounter - # out-of-range exceptions. - d = d.repeat() + # `sloppy` mode means that the interleaving is not exact. This adds + # even more randomness to the training pipeline. + d = d.apply( + tf.contrib.data.parallel_interleave( + tf.data.TFRecordDataset, + sloppy=is_training, + cycle_length=cycle_length)) + d = d.shuffle(buffer_size=100) + else: + d = tf.data.TFRecordDataset(input_files) + # Since we evaluate for a fixed number of steps we don't want to encounter + # out-of-range exceptions. + d = d.repeat() - # We must `drop_remainder` on training because the TPU requires fixed - # size dimensions. For eval, we assume we are evaling on the CPU or GPU - # and we *don"t* want to drop the remainder, otherwise we wont cover - # every sample. - d = d.apply( - tf.contrib.data.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - num_parallel_batches=num_cpu_threads, - drop_remainder=True)) - return d + # We must `drop_remainder` on training because the TPU requires fixed + # size dimensions. For eval, we assume we are evaling on the CPU or GPU + # and we *don"t* want to drop the remainder, otherwise we wont cover + # every sample. + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + num_parallel_batches=num_cpu_threads, + drop_remainder=True)) + return d - return input_fn + return input_fn def _decode_record(record, name_to_features): - """Decodes a record to a TensorFlow example.""" - example = tf.parse_single_example(record, name_to_features) + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) - # tf.Example only supports tf.int64, but the TPU only supports tf.int32. - # So cast all int64 to int32. - for name in list(example.keys()): - t = example[name] - if t.dtype == tf.int64: - t = tf.to_int32(t) - example[name] = t + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t - return example + return example def main(_): - tf.logging.set_verbosity(tf.logging.INFO) + tf.logging.set_verbosity(tf.logging.INFO) - if not FLAGS.do_train and not FLAGS.do_eval: - raise ValueError("At least one of `do_train` or `do_eval` must be True.") + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") - bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) - tf.gfile.MakeDirs(FLAGS.output_dir) + tf.gfile.MakeDirs(FLAGS.output_dir) - input_files = [] - for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) - tf.logging.info("*** Input Files ***") - for input_file in input_files: - tf.logging.info(" %s" % input_file) + tf.logging.info("*** Input Files ***") + for input_file in input_files: + tf.logging.info(" %s" % input_file) - tpu_cluster_resolver = None - if FLAGS.use_tpu and FLAGS.tpu_name: - tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( - FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) - is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 - run_config = tf.contrib.tpu.RunConfig( - cluster=tpu_cluster_resolver, - master=FLAGS.master, - model_dir=FLAGS.output_dir, - save_checkpoints_steps=FLAGS.save_checkpoints_steps, - tpu_config=tf.contrib.tpu.TPUConfig( - iterations_per_loop=FLAGS.iterations_per_loop, - num_shards=FLAGS.num_tpu_cores, - per_host_input_for_training=is_per_host)) + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) - model_fn = model_fn_builder( - bert_config=bert_config, - init_checkpoint=FLAGS.init_checkpoint, - learning_rate=FLAGS.learning_rate, - num_train_steps=FLAGS.num_train_steps, - num_warmup_steps=FLAGS.num_warmup_steps, - use_tpu=FLAGS.use_tpu, - use_one_hot_embeddings=FLAGS.use_tpu) + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.num_train_steps, + num_warmup_steps=FLAGS.num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) - # If TPU is not available, this will fall back to normal Estimator on CPU - # or GPU. - estimator = tf.contrib.tpu.TPUEstimator( - use_tpu=FLAGS.use_tpu, - model_fn=model_fn, - config=run_config, - train_batch_size=FLAGS.train_batch_size, - eval_batch_size=FLAGS.eval_batch_size) + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) - if FLAGS.do_train: - tf.logging.info("***** Running training *****") - tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) - train_input_fn = input_fn_builder( - input_files=input_files, - max_seq_length=FLAGS.max_seq_length, - max_predictions_per_seq=FLAGS.max_predictions_per_seq, - is_training=True) - estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) + if FLAGS.do_train: + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + train_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) - if FLAGS.do_eval: - tf.logging.info("***** Running evaluation *****") - tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) - eval_input_fn = input_fn_builder( - input_files=input_files, - max_seq_length=FLAGS.max_seq_length, - max_predictions_per_seq=FLAGS.max_predictions_per_seq, - is_training=False) + eval_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=False) - result = estimator.evaluate( - input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) + result = estimator.evaluate( + input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) - output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") - with tf.gfile.GFile(output_eval_file, "w") as writer: - tf.logging.info("***** Eval results *****") - for key in sorted(result.keys()): - tf.logging.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) if __name__ == "__main__": - flags.mark_flag_as_required("input_file") - flags.mark_flag_as_required("bert_config_file") - flags.mark_flag_as_required("output_dir") - tf.app.run() + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/run_squad.py b/run_squad.py index c6677966f0..642a18787d 100644 --- a/run_squad.py +++ b/run_squad.py @@ -146,562 +146,562 @@ flags.DEFINE_bool( class SquadExample(object): - """A single training/test example for simple sequence classification.""" + """A single training/test example for simple sequence classification.""" - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position - def __str__(self): - return self.__repr__() + def __str__(self): + return self.__repr__() - def __repr__(self): - s = "" - s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) - s += ", question_text: %s" % ( - tokenization.printable_text(self.question_text)) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.start_position: - s += ", end_position: %d" % (self.end_position) - return s + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + return s class InputFeatures(object): - """A single set of features of data.""" + """A single set of features of data.""" - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - start_position=None, - end_position=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_position = start_position - self.end_position = end_position + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position def read_squad_examples(input_file, is_training): - """Read a SQuAD json file into a list of SquadExample.""" - with tf.gfile.Open(input_file, "r") as reader: - input_data = json.load(reader)["data"] + """Read a SQuAD json file into a list of SquadExample.""" + with tf.gfile.Open(input_file, "r") as reader: + input_data = json.load(reader)["data"] - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - end_position = None - orig_answer_text = None - if is_training: - if len(qa["answers"]) != 1: - raise ValueError( - "For training, each question should have exactly 1 answer.") - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - 1] - # Only add answers where the text can be exactly recovered from the - # document. If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - tokenization.whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - tf.logging.warning("Could not find answer: '%s' vs. '%s'", - actual_text, cleaned_answer_text) - continue + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + if is_training: + if len(qa["answers"]) != 1: + raise ValueError( + "For training, each question should have exactly 1 answer.") + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + tf.logging.warning("Could not find answer: '%s' vs. '%s'", + actual_text, cleaned_answer_text) + continue - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position) - examples.append(example) - return examples + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position) + examples.append(example) + return examples def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training): - """Loads a data file into a list of `InputBatch`s.""" + """Loads a data file into a list of `InputBatch`s.""" - unique_id = 1000000000 + unique_id = 1000000000 - features = [] - for (example_index, example) in enumerate(examples): - query_tokens = tokenizer.tokenize(example.question_text) + features = [] + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) - tok_start_position = None - tok_end_position = None - if is_training: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - start_position = None - end_position = None - if is_training: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - if (example.start_position < doc_start or - example.end_position < doc_start or - example.start_position > doc_end or example.end_position > doc_end): - continue - - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - if example_index < 20: - tf.logging.info("*** Example ***") - tf.logging.info("unique_id: %s" % (unique_id)) - tf.logging.info("example_index: %s" % (example_index)) - tf.logging.info("doc_span_index: %s" % (doc_span_index)) - tf.logging.info("tokens: %s" % " ".join( - [tokenization.printable_text(x) for x in tokens])) - tf.logging.info("token_to_orig_map: %s" % " ".join( - ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)])) - tf.logging.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) - ])) - tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - tf.logging.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) - tf.logging.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tok_start_position = None + tok_end_position = None if is_training: - answer_text = " ".join(tokens[start_position:(end_position + 1)]) - tf.logging.info("start_position: %d" % (start_position)) - tf.logging.info("end_position: %d" % (end_position)) - tf.logging.info( - "answer: %s" % (tokenization.printable_text(answer_text))) + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) - features.append( - InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - start_position=start_position, - end_position=end_position)) - unique_id += 1 + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - return features + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + start_position = None + end_position = None + if is_training: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + if (example.start_position < doc_start or + example.end_position < doc_start or + example.start_position > doc_end or example.end_position > doc_end): + continue + + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if example_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("unique_id: %s" % (unique_id)) + tf.logging.info("example_index: %s" % (example_index)) + tf.logging.info("doc_span_index: %s" % (doc_span_index)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("token_to_orig_map: %s" % " ".join( + ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)])) + tf.logging.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) + ])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info( + "input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if is_training: + answer_text = " ".join(tokens[start_position:(end_position + 1)]) + tf.logging.info("start_position: %d" % (start_position)) + tf.logging.info("end_position: %d" % (end_position)) + tf.logging.info( + "answer: %s" % (tokenization.printable_text(answer_text))) + + features.append( + InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position)) + unique_id += 1 + + return features def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" + """Returns tokenized answer spans that better match the annotated answer.""" - # The SQuAD annotations are character based. We first project them to - # whitespace-tokenized words. But then after WordPiece tokenization, we can - # often find a "better match". For example: - # - # Question: What year was John Smith born? - # Context: The leader was John Smith (1895-1943). - # Answer: 1895 - # - # The original whitespace-tokenized answer will be "(1895-1943).". However - # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match - # the exact answer, 1895. - # - # However, this is not always possible. Consider the following: - # - # Question: What country is the top exporter of electornics? - # Context: The Japanese electronics industry is the lagest in the world. - # Answer: Japan - # - # In this case, the annotator chose "Japan" as a character sub-span of - # the word "Japanese". Since our WordPiece tokenizer does not split - # "Japanese", we just use "Japanese" as the annotation. This is fairly rare - # in SQuAD, but does happen. - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). + # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. Consider the following: + # + # Question: What country is the top exporter of electornics? + # Context: The Japanese electronics industry is the lagest in the world. + # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) - return (input_start, input_end) + return (input_start, input_end) def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" + """Check if this is the 'max context' doc span for the token.""" - # Because of the sliding window approach taken to scoring documents, a single - # token can appear in multiple documents. E.g. - # Doc: the man went to the store and bought a gallon of milk - # Span A: the man went to the - # Span B: to the store and bought - # Span C: and bought a gallon of - # ... - # - # Now the word 'bought' will have two scores from spans B and C. We only - # want to consider the score with "maximum context", which we define as - # the *minimum* of its left and right context (the *sum* of left and - # right context will always be the same, of course). - # - # In the example the maximum context for 'bought' would be span C since - # it has 1 left context and 3 right context, while span B has 4 left context - # and 0 right context. - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index - return cur_span_index == best_span_index + return cur_span_index == best_span_index def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): - """Creates a classification model.""" - model = modeling.BertModel( - config=bert_config, - is_training=is_training, - input_ids=input_ids, - input_mask=input_mask, - token_type_ids=segment_ids, - use_one_hot_embeddings=use_one_hot_embeddings) + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) - final_hidden = model.get_sequence_output() + final_hidden = model.get_sequence_output() - final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) - batch_size = final_hidden_shape[0] - seq_length = final_hidden_shape[1] - hidden_size = final_hidden_shape[2] + final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) + batch_size = final_hidden_shape[0] + seq_length = final_hidden_shape[1] + hidden_size = final_hidden_shape[2] - output_weights = tf.get_variable( - "cls/squad/output_weights", [2, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) + output_weights = tf.get_variable( + "cls/squad/output_weights", [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) - output_bias = tf.get_variable( - "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) + output_bias = tf.get_variable( + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) - final_hidden_matrix = tf.reshape(final_hidden, - [batch_size * seq_length, hidden_size]) - logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) + final_hidden_matrix = tf.reshape(final_hidden, + [batch_size * seq_length, hidden_size]) + logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) - logits = tf.reshape(logits, [batch_size, seq_length, 2]) - logits = tf.transpose(logits, [2, 0, 1]) + logits = tf.reshape(logits, [batch_size, seq_length, 2]) + logits = tf.transpose(logits, [2, 0, 1]) - unstacked_logits = tf.unstack(logits, axis=0) + unstacked_logits = tf.unstack(logits, axis=0) - (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) - return (start_logits, end_logits) + return (start_logits, end_logits) def model_fn_builder(bert_config, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings): - """Returns `model_fn` closure for TPUEstimator.""" + """Returns `model_fn` closure for TPUEstimator.""" - def model_fn(features, labels, mode, params): # pylint: disable=unused-argument - """The `model_fn` for TPUEstimator.""" + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" - tf.logging.info("*** Features ***") - for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) - unique_ids = features["unique_ids"] - input_ids = features["input_ids"] - input_mask = features["input_mask"] - segment_ids = features["segment_ids"] + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] - is_training = (mode == tf.estimator.ModeKeys.TRAIN) + is_training = (mode == tf.estimator.ModeKeys.TRAIN) - (start_logits, end_logits) = create_model( - bert_config=bert_config, - is_training=is_training, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - use_one_hot_embeddings=use_one_hot_embeddings) + (start_logits, end_logits) = create_model( + bert_config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) - tvars = tf.trainable_variables() + tvars = tf.trainable_variables() - initialized_variable_names = {} - scaffold_fn = None - if init_checkpoint: - (assignment_map, - initialized_variable_names) = modeling.get_assigment_map_from_checkpoint( - tvars, init_checkpoint) - if use_tpu: + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, + initialized_variable_names) = modeling.get_assigment_map_from_checkpoint( + tvars, init_checkpoint) + if use_tpu: - def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() - scaffold_fn = tpu_scaffold - else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = "" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, - init_string) + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) - output_spec = None - if mode == tf.estimator.ModeKeys.TRAIN: - seq_length = modeling.get_shape_list(input_ids)[1] + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + seq_length = modeling.get_shape_list(input_ids)[1] - def compute_loss(logits, positions): - one_hot_positions = tf.one_hot( - positions, depth=seq_length, dtype=tf.float32) - log_probs = tf.nn.log_softmax(logits, axis=-1) - loss = -tf.reduce_mean( - tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) - return loss + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss - start_positions = features["start_positions"] - end_positions = features["end_positions"] + start_positions = features["start_positions"] + end_positions = features["end_positions"] - start_loss = compute_loss(start_logits, start_positions) - end_loss = compute_loss(end_logits, end_positions) + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2.0 + total_loss = (start_loss + end_loss) / 2.0 - train_op = optimization.create_optimizer( - total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) - output_spec = tf.contrib.tpu.TPUEstimatorSpec( - mode=mode, - loss=total_loss, - train_op=train_op, - scaffold_fn=scaffold_fn) - elif mode == tf.estimator.ModeKeys.PREDICT: - predictions = { - "unique_ids": unique_ids, - "start_logits": start_logits, - "end_logits": end_logits, - } - output_spec = tf.contrib.tpu.TPUEstimatorSpec( - mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) - else: - raise ValueError( - "Only TRAIN and PREDICT modes are supported: %s" % (mode)) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.PREDICT: + predictions = { + "unique_ids": unique_ids, + "start_logits": start_logits, + "end_logits": end_logits, + } + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + else: + raise ValueError( + "Only TRAIN and PREDICT modes are supported: %s" % (mode)) - return output_spec + return output_spec - return model_fn + return model_fn def input_fn_builder(features, seq_length, is_training, drop_remainder): - """Creates an `input_fn` closure to be passed to TPUEstimator.""" + """Creates an `input_fn` closure to be passed to TPUEstimator.""" - all_unique_ids = [] - all_input_ids = [] - all_input_mask = [] - all_segment_ids = [] - all_start_positions = [] - all_end_positions = [] + all_unique_ids = [] + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_start_positions = [] + all_end_positions = [] - for feature in features: - all_unique_ids.append(feature.unique_id) - all_input_ids.append(feature.input_ids) - all_input_mask.append(feature.input_mask) - all_segment_ids.append(feature.segment_ids) - if is_training: - all_start_positions.append(feature.start_position) - all_end_positions.append(feature.end_position) + for feature in features: + all_unique_ids.append(feature.unique_id) + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + if is_training: + all_start_positions.append(feature.start_position) + all_end_positions.append(feature.end_position) - def input_fn(params): - """The actual input function.""" - batch_size = params["batch_size"] + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] - num_examples = len(features) + num_examples = len(features) - # This is for demo purposes and does NOT scale to large data sets. We do - # not use Dataset.from_generator() because that uses tf.py_func which is - # not TPU compatible. The right way to load data is with TFRecordReader. - feature_map = { - "unique_ids": - tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), - "input_ids": - tf.constant( - all_input_ids, shape=[num_examples, seq_length], - dtype=tf.int32), - "input_mask": - tf.constant( - all_input_mask, - shape=[num_examples, seq_length], - dtype=tf.int32), - "segment_ids": - tf.constant( - all_segment_ids, - shape=[num_examples, seq_length], - dtype=tf.int32), - } - if is_training: - feature_map["start_positions"] = tf.constant( - all_start_positions, shape=[num_examples], dtype=tf.int32) - feature_map["end_positions"] = tf.constant( - all_end_positions, shape=[num_examples], dtype=tf.int32) + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + feature_map = { + "unique_ids": + tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + } + if is_training: + feature_map["start_positions"] = tf.constant( + all_start_positions, shape=[num_examples], dtype=tf.int32) + feature_map["end_positions"] = tf.constant( + all_end_positions, shape=[num_examples], dtype=tf.int32) - d = tf.data.Dataset.from_tensor_slices(feature_map) + d = tf.data.Dataset.from_tensor_slices(feature_map) - if is_training: - d = d.repeat() - d = d.shuffle(buffer_size=100) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) - d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) - return d + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d - return input_fn + return input_fn RawResult = collections.namedtuple("RawResult", @@ -711,410 +711,410 @@ RawResult = collections.namedtuple("RawResult", def write_predictions(all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file): - """Write final predictions to the json file.""" - tf.logging.info("Writing predictions to: %s" % (output_prediction_file)) - tf.logging.info("Writing nbest to: %s" % (output_nbest_file)) + """Write final predictions to the json file.""" + tf.logging.info("Writing predictions to: %s" % (output_prediction_file)) + tf.logging.info("Writing nbest to: %s" % (output_nbest_file)) - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] - prelim_predictions = [] - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] + prelim_predictions = [] + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] - start_indexes = _get_best_indexes(result.start_logits, n_best_size) - end_indexes = _get_best_indexes(result.end_logits, n_best_size) - for start_index in start_indexes: - for end_index in end_indexes: - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. - if start_index >= len(feature.tokens): - continue - if end_index >= len(feature.tokens): - continue - if start_index not in feature.token_to_orig_map: - continue - if end_index not in feature.token_to_orig_map: - continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = " ".join(tok_tokens) + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) - # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case) - if final_text in seen_predictions: - continue + final_text = get_final_text(tok_text, orig_text, do_lower_case) + if final_text in seen_predictions: + continue - seen_predictions[final_text] = True - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) + seen_predictions[final_text] = True + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. - if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - assert len(nbest) >= 1 + assert len(nbest) >= 1 - total_scores = [] - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) + total_scores = [] + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) - probs = _compute_softmax(total_scores) + probs = _compute_softmax(total_scores) - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_json.append(output) + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) - assert len(nbest_json) >= 1 + assert len(nbest_json) >= 1 - all_predictions[example.qas_id] = nbest_json[0]["text"] - all_nbest_json[example.qas_id] = nbest_json + all_predictions[example.qas_id] = nbest_json[0]["text"] + all_nbest_json[example.qas_id] = nbest_json - with tf.gfile.GFile(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") + with tf.gfile.GFile(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") - with tf.gfile.GFile(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + with tf.gfile.GFile(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") def get_final_text(pred_text, orig_text, do_lower_case): - """Project the tokenized prediction back to the original text.""" + """Project the tokenized prediction back to the original text.""" - # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". - # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heruistic between - # `pred_text` and `orig_text` to get a character-to-charcter alignment. This - # can fail in certain cases in which case we just return `orig_text`. + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heruistic between + # `pred_text` and `orig_text` to get a character-to-charcter alignment. This + # can fail in certain cases in which case we just return `orig_text`. - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": - continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. - tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) - tok_text = " ".join(tokenizer.tokenize(orig_text)) + tok_text = " ".join(tokenizer.tokenize(orig_text)) - start_position = tok_text.find(pred_text) - if start_position == -1: - if FLAGS.verbose_logging: - tf.logging.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) - return orig_text - end_position = start_position + len(pred_text) - 1 + start_position = tok_text.find(pred_text) + if start_position == -1: + if FLAGS.verbose_logging: + tf.logging.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - if len(orig_ns_text) != len(tok_ns_text): - if FLAGS.verbose_logging: - tf.logging.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text + if len(orig_ns_text) != len(tok_ns_text): + if FLAGS.verbose_logging: + tf.logging.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. - tok_s_to_ns_map = {} - for (i, tok_index) in six.iteritems(tok_ns_to_s_map): - tok_s_to_ns_map[tok_index] = i + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] - if orig_start_position is None: - if FLAGS.verbose_logging: - tf.logging.info("Couldn't map start position") - return orig_text + if orig_start_position is None: + if FLAGS.verbose_logging: + tf.logging.info("Couldn't map start position") + return orig_text - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] - if orig_end_position is None: - if FLAGS.verbose_logging: - tf.logging.info("Couldn't map end position") - return orig_text + if orig_end_position is None: + if FLAGS.verbose_logging: + tf.logging.info("Couldn't map end position") + return orig_text - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return best_indexes + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] + """Compute softmax probability over raw logits.""" + if not scores: + return [] - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs def main(_): - tf.logging.set_verbosity(tf.logging.INFO) + tf.logging.set_verbosity(tf.logging.INFO) - if not FLAGS.do_train and not FLAGS.do_predict: - raise ValueError("At least one of `do_train` or `do_predict` must be True.") + if not FLAGS.do_train and not FLAGS.do_predict: + raise ValueError("At least one of `do_train` or `do_predict` must be True.") - if FLAGS.do_train: - if not FLAGS.train_file: - raise ValueError( - "If `do_train` is True, then `train_file` must be specified.") - if FLAGS.do_predict: - if not FLAGS.predict_file: - raise ValueError( - "If `do_predict` is True, then `predict_file` must be specified.") + if FLAGS.do_train: + if not FLAGS.train_file: + raise ValueError( + "If `do_train` is True, then `train_file` must be specified.") + if FLAGS.do_predict: + if not FLAGS.predict_file: + raise ValueError( + "If `do_predict` is True, then `predict_file` must be specified.") - bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) - if FLAGS.max_seq_length > bert_config.max_position_embeddings: - raise ValueError( - "Cannot use sequence length %d because the BERT model " - "was only trained up to sequence length %d" % - (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) - tf.gfile.MakeDirs(FLAGS.output_dir) + tf.gfile.MakeDirs(FLAGS.output_dir) - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - tpu_cluster_resolver = None - if FLAGS.use_tpu and FLAGS.tpu_name: - tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( - FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) - is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 - run_config = tf.contrib.tpu.RunConfig( - cluster=tpu_cluster_resolver, - master=FLAGS.master, - model_dir=FLAGS.output_dir, - save_checkpoints_steps=FLAGS.save_checkpoints_steps, - tpu_config=tf.contrib.tpu.TPUConfig( - iterations_per_loop=FLAGS.iterations_per_loop, - num_shards=FLAGS.num_tpu_cores, - per_host_input_for_training=is_per_host)) + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) - train_examples = None - num_train_steps = None - num_warmup_steps = None - if FLAGS.do_train: - train_examples = read_squad_examples( - input_file=FLAGS.train_file, is_training=True) - num_train_steps = int( - len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) - num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = read_squad_examples( + input_file=FLAGS.train_file, is_training=True) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) - model_fn = model_fn_builder( - bert_config=bert_config, - init_checkpoint=FLAGS.init_checkpoint, - learning_rate=FLAGS.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - use_tpu=FLAGS.use_tpu, - use_one_hot_embeddings=FLAGS.use_tpu) + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) - # If TPU is not available, this will fall back to normal Estimator on CPU - # or GPU. - estimator = tf.contrib.tpu.TPUEstimator( - use_tpu=FLAGS.use_tpu, - model_fn=model_fn, - config=run_config, - train_batch_size=FLAGS.train_batch_size, - predict_batch_size=FLAGS.predict_batch_size) + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + predict_batch_size=FLAGS.predict_batch_size) - if FLAGS.do_train: - train_features = convert_examples_to_features( - examples=train_examples, - tokenizer=tokenizer, - max_seq_length=FLAGS.max_seq_length, - doc_stride=FLAGS.doc_stride, - max_query_length=FLAGS.max_query_length, - is_training=True) - tf.logging.info("***** Running training *****") - tf.logging.info(" Num orig examples = %d", len(train_examples)) - tf.logging.info(" Num split examples = %d", len(train_features)) - tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) - tf.logging.info(" Num steps = %d", num_train_steps) - train_input_fn = input_fn_builder( - features=train_features, - seq_length=FLAGS.max_seq_length, - is_training=True, - drop_remainder=True) - estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + if FLAGS.do_train: + train_features = convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=True) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num orig examples = %d", len(train_examples)) + tf.logging.info(" Num split examples = %d", len(train_features)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = input_fn_builder( + features=train_features, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) - if FLAGS.do_predict: - eval_examples = read_squad_examples( - input_file=FLAGS.predict_file, is_training=False) - eval_features = convert_examples_to_features( - examples=eval_examples, - tokenizer=tokenizer, - max_seq_length=FLAGS.max_seq_length, - doc_stride=FLAGS.doc_stride, - max_query_length=FLAGS.max_query_length, - is_training=False) + if FLAGS.do_predict: + eval_examples = read_squad_examples( + input_file=FLAGS.predict_file, is_training=False) + eval_features = convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=False) - tf.logging.info("***** Running predictions *****") - tf.logging.info(" Num orig examples = %d", len(eval_examples)) - tf.logging.info(" Num split examples = %d", len(eval_features)) - tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + tf.logging.info("***** Running predictions *****") + tf.logging.info(" Num orig examples = %d", len(eval_examples)) + tf.logging.info(" Num split examples = %d", len(eval_features)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) - all_results = [] + all_results = [] - predict_input_fn = input_fn_builder( - features=eval_features, - seq_length=FLAGS.max_seq_length, - is_training=False, - drop_remainder=False) + predict_input_fn = input_fn_builder( + features=eval_features, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=False) - # If running eval on the TPU, you will need to specify the number of - # steps. - all_results = [] - for result in estimator.predict( - predict_input_fn, yield_single_examples=True): - if len(all_results) % 1000 == 0: - tf.logging.info("Processing example: %d" % (len(all_results))) - unique_id = int(result["unique_ids"]) - start_logits = [float(x) for x in result["start_logits"].flat] - end_logits = [float(x) for x in result["end_logits"].flat] - all_results.append( - RawResult( - unique_id=unique_id, - start_logits=start_logits, - end_logits=end_logits)) + # If running eval on the TPU, you will need to specify the number of + # steps. + all_results = [] + for result in estimator.predict( + predict_input_fn, yield_single_examples=True): + if len(all_results) % 1000 == 0: + tf.logging.info("Processing example: %d" % (len(all_results))) + unique_id = int(result["unique_ids"]) + start_logits = [float(x) for x in result["start_logits"].flat] + end_logits = [float(x) for x in result["end_logits"].flat] + all_results.append( + RawResult( + unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) - output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json") - output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json") - write_predictions(eval_examples, eval_features, all_results, - FLAGS.n_best_size, FLAGS.max_answer_length, - FLAGS.do_lower_case, output_prediction_file, - output_nbest_file) + output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json") + output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json") + write_predictions(eval_examples, eval_features, all_results, + FLAGS.n_best_size, FLAGS.max_answer_length, + FLAGS.do_lower_case, output_prediction_file, + output_nbest_file) if __name__ == "__main__": - flags.mark_flag_as_required("vocab_file") - flags.mark_flag_as_required("bert_config_file") - flags.mark_flag_as_required("output_dir") - tf.app.run() + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/tokenization.py b/tokenization.py index 50d0b31cc5..a24ba8d457 100644 --- a/tokenization.py +++ b/tokenization.py @@ -25,268 +25,268 @@ import tensorflow as tf def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - elif isinstance(text, unicode): - return text - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") + raise ValueError("Not running on Python2 or Python 3?") def printable_text(text): - """Returns text encoded in a way suitable for print or `tf.logging`.""" + """Returns text encoded in a way suitable for print or `tf.logging`.""" - # These functions want `str` for both Python2 and Python3, but in one case - # it's a Unicode string and in the other it's a byte string. - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text - elif isinstance(text, unicode): - return text.encode("utf-8") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") + raise ValueError("Not running on Python2 or Python 3?") def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - index = 0 - with tf.gfile.GFile(vocab_file, "r") as reader: - while True: - token = convert_to_unicode(reader.readline()) - if not token: - break - token = token.strip() - vocab[token] = index - index += 1 - return vocab + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with tf.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab def convert_tokens_to_ids(vocab, tokens): - """Converts a sequence of tokens into ids using the vocab.""" - ids = [] - for token in tokens: - ids.append(vocab[token]) - return ids + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(vocab[token]) + return ids def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a peice of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens + """Runs basic whitespace cleaning and splitting on a peice of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens class FullTokenizer(object): - """Runs end-to-end tokenziation.""" + """Runs end-to-end tokenziation.""" - def __init__(self, vocab_file, do_lower_case=True): - self.vocab = load_vocab(vocab_file) - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - def tokenize(self, text): - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) - return split_tokens + return split_tokens - def convert_tokens_to_ids(self, tokens): - return convert_tokens_to_ids(self.vocab, tokens) + def convert_tokens_to_ids(self, tokens): + return convert_tokens_to_ids(self.vocab, tokens) class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - def __init__(self, do_lower_case=True): - """Constructs a BasicTokenizer. + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. - Args: - do_lower_case: Whether to lower case the input. - """ - self.do_lower_case = do_lower_case + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = convert_to_unicode(text) - text = self._clean_text(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 - return ["".join(x) for x in output] + return ["".join(x) for x in output] - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) class WordpieceTokenizer(object): - """Runs WordPiece tokenziation.""" + """Runs WordPiece tokenziation.""" - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer. + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. - Returns: - A list of wordpiece tokens. - """ + Returns: + A list of wordpiece tokens. + """ - text = convert_to_unicode(text) + text = convert_to_unicode(text) - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/tokenization_test.py b/tokenization_test.py index 5665f54fae..9a6e7693a1 100644 --- a/tokenization_test.py +++ b/tokenization_test.py @@ -25,101 +25,101 @@ import tensorflow as tf class TokenizationTest(tf.test.TestCase): - def test_full_tokenizer(self): - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing", "," - ] - with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + def test_full_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing", "," + ] + with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - vocab_file = vocab_writer.name + vocab_file = vocab_writer.name - tokenizer = tokenization.FullTokenizer(vocab_file) - os.unlink(vocab_file) + tokenizer = tokenization.FullTokenizer(vocab_file) + os.unlink(vocab_file) - tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") - self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) - self.assertAllEqual( - tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + self.assertAllEqual( + tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) - def test_basic_tokenizer_lower(self): - tokenizer = tokenization.BasicTokenizer(do_lower_case=True) + def test_basic_tokenizer_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=True) - self.assertAllEqual( - tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), - ["hello", "!", "how", "are", "you", "?"]) - self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["hello", "!", "how", "are", "you", "?"]) + self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) - def test_basic_tokenizer_no_lower(self): - tokenizer = tokenization.BasicTokenizer(do_lower_case=False) + def test_basic_tokenizer_no_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=False) - self.assertAllEqual( - tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), - ["HeLLo", "!", "how", "Are", "yoU", "?"]) + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) - def test_wordpiece_tokenizer(self): - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing" - ] + def test_wordpiece_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] - vocab = {} - for (i, token) in enumerate(vocab_tokens): - vocab[token] = i - tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) - self.assertAllEqual(tokenizer.tokenize(""), []) + self.assertAllEqual(tokenizer.tokenize(""), []) - self.assertAllEqual( - tokenizer.tokenize("unwanted running"), - ["un", "##want", "##ed", "runn", "##ing"]) + self.assertAllEqual( + tokenizer.tokenize("unwanted running"), + ["un", "##want", "##ed", "runn", "##ing"]) - self.assertAllEqual( - tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + self.assertAllEqual( + tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) - def test_convert_tokens_to_ids(self): - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing" - ] + def test_convert_tokens_to_ids(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] - vocab = {} - for (i, token) in enumerate(vocab_tokens): - vocab[token] = i + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i - self.assertAllEqual( - tokenization.convert_tokens_to_ids( - vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) + self.assertAllEqual( + tokenization.convert_tokens_to_ids( + vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) - def test_is_whitespace(self): - self.assertTrue(tokenization._is_whitespace(u" ")) - self.assertTrue(tokenization._is_whitespace(u"\t")) - self.assertTrue(tokenization._is_whitespace(u"\r")) - self.assertTrue(tokenization._is_whitespace(u"\n")) - self.assertTrue(tokenization._is_whitespace(u"\u00A0")) + def test_is_whitespace(self): + self.assertTrue(tokenization._is_whitespace(u" ")) + self.assertTrue(tokenization._is_whitespace(u"\t")) + self.assertTrue(tokenization._is_whitespace(u"\r")) + self.assertTrue(tokenization._is_whitespace(u"\n")) + self.assertTrue(tokenization._is_whitespace(u"\u00A0")) - self.assertFalse(tokenization._is_whitespace(u"A")) - self.assertFalse(tokenization._is_whitespace(u"-")) + self.assertFalse(tokenization._is_whitespace(u"A")) + self.assertFalse(tokenization._is_whitespace(u"-")) - def test_is_control(self): - self.assertTrue(tokenization._is_control(u"\u0005")) + def test_is_control(self): + self.assertTrue(tokenization._is_control(u"\u0005")) - self.assertFalse(tokenization._is_control(u"A")) - self.assertFalse(tokenization._is_control(u" ")) - self.assertFalse(tokenization._is_control(u"\t")) - self.assertFalse(tokenization._is_control(u"\r")) + self.assertFalse(tokenization._is_control(u"A")) + self.assertFalse(tokenization._is_control(u" ")) + self.assertFalse(tokenization._is_control(u"\t")) + self.assertFalse(tokenization._is_control(u"\r")) - def test_is_punctuation(self): - self.assertTrue(tokenization._is_punctuation(u"-")) - self.assertTrue(tokenization._is_punctuation(u"$")) - self.assertTrue(tokenization._is_punctuation(u"`")) - self.assertTrue(tokenization._is_punctuation(u".")) + def test_is_punctuation(self): + self.assertTrue(tokenization._is_punctuation(u"-")) + self.assertTrue(tokenization._is_punctuation(u"$")) + self.assertTrue(tokenization._is_punctuation(u"`")) + self.assertTrue(tokenization._is_punctuation(u".")) - self.assertFalse(tokenization._is_punctuation(u"A")) - self.assertFalse(tokenization._is_punctuation(u" ")) + self.assertFalse(tokenization._is_punctuation(u"A")) + self.assertFalse(tokenization._is_punctuation(u" ")) if __name__ == "__main__": - tf.test.main() + tf.test.main()