Simplifying code for easier understanding.
This commit is contained in:
@@ -196,9 +196,7 @@ class ColaProcessor(DataProcessor):
|
|||||||
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
|
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
|
||||||
"""Loads a data file into a list of `InputBatch`s."""
|
"""Loads a data file into a list of `InputBatch`s."""
|
||||||
|
|
||||||
label_map = {}
|
label_map = {label : i for i, label in enumerate(label_list)}
|
||||||
for (i, label) in enumerate(label_list):
|
|
||||||
label_map[label] = i
|
|
||||||
|
|
||||||
features = []
|
features = []
|
||||||
for (ex_index, example) in enumerate(examples):
|
for (ex_index, example) in enumerate(examples):
|
||||||
@@ -207,8 +205,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
|||||||
tokens_b = None
|
tokens_b = None
|
||||||
if example.text_b:
|
if example.text_b:
|
||||||
tokens_b = tokenizer.tokenize(example.text_b)
|
tokens_b = tokenizer.tokenize(example.text_b)
|
||||||
|
|
||||||
if tokens_b:
|
|
||||||
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
||||||
# length is less than the specified length.
|
# length is less than the specified length.
|
||||||
# Account for [CLS], [SEP], [SEP] with "- 3"
|
# Account for [CLS], [SEP], [SEP] with "- 3"
|
||||||
@@ -216,7 +212,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
|||||||
else:
|
else:
|
||||||
# Account for [CLS] and [SEP] with "- 2"
|
# Account for [CLS] and [SEP] with "- 2"
|
||||||
if len(tokens_a) > max_seq_length - 2:
|
if len(tokens_a) > max_seq_length - 2:
|
||||||
tokens_a = tokens_a[0:(max_seq_length - 2)]
|
tokens_a = tokens_a[:(max_seq_length - 2)]
|
||||||
|
|
||||||
# The convention in BERT is:
|
# The convention in BERT is:
|
||||||
# (a) For sequence pairs:
|
# (a) For sequence pairs:
|
||||||
@@ -236,22 +232,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
|||||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
# For classification tasks, the first vector (corresponding to [CLS]) is
|
||||||
# used as the "sentence vector". Note that this only makes sense because
|
# used as the "sentence vector". Note that this only makes sense because
|
||||||
# the entire model is fine-tuned.
|
# the entire model is fine-tuned.
|
||||||
tokens = []
|
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
|
||||||
segment_ids = []
|
segment_ids = [0] * len(tokens)
|
||||||
tokens.append("[CLS]")
|
|
||||||
segment_ids.append(0)
|
|
||||||
for token in tokens_a:
|
|
||||||
tokens.append(token)
|
|
||||||
segment_ids.append(0)
|
|
||||||
tokens.append("[SEP]")
|
|
||||||
segment_ids.append(0)
|
|
||||||
|
|
||||||
if tokens_b:
|
if tokens_b:
|
||||||
for token in tokens_b:
|
tokens += tokens_b + ["[SEP]"]
|
||||||
tokens.append(token)
|
segment_ids += [1] * (len(tokens_b) + 1)
|
||||||
segment_ids.append(1)
|
|
||||||
tokens.append("[SEP]")
|
|
||||||
segment_ids.append(1)
|
|
||||||
|
|
||||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||||
|
|
||||||
@@ -260,10 +246,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
|||||||
input_mask = [1] * len(input_ids)
|
input_mask = [1] * len(input_ids)
|
||||||
|
|
||||||
# Zero-pad up to the sequence length.
|
# Zero-pad up to the sequence length.
|
||||||
while len(input_ids) < max_seq_length:
|
padding = [0] * (max_seq_length - len(input_ids))
|
||||||
input_ids.append(0)
|
input_ids += padding
|
||||||
input_mask.append(0)
|
input_mask += padding
|
||||||
segment_ids.append(0)
|
segment_ids += padding
|
||||||
|
|
||||||
assert len(input_ids) == max_seq_length
|
assert len(input_ids) == max_seq_length
|
||||||
assert len(input_mask) == max_seq_length
|
assert len(input_mask) == max_seq_length
|
||||||
|
|||||||
Reference in New Issue
Block a user