Fix typo in tokenization module name (use tokenization_pytorch)
This commit is contained in:
@@ -23,7 +23,7 @@ import logging
|
|||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import tokenization
|
import tokenization_pytorch
|
||||||
import six
|
import six
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
@@ -62,9 +62,9 @@ class SquadExample(object):
|
|||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
s = ""
|
s = ""
|
||||||
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
|
s += "qas_id: %s" % (tokenization_pytorch.printable_text(self.qas_id))
|
||||||
s += ", question_text: %s" % (
|
s += ", question_text: %s" % (
|
||||||
tokenization.printable_text(self.question_text))
|
tokenization_pytorch.printable_text(self.question_text))
|
||||||
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
|
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
|
||||||
if self.start_position:
|
if self.start_position:
|
||||||
s += ", start_position: %d" % (self.start_position)
|
s += ", start_position: %d" % (self.start_position)
|
||||||
@@ -153,7 +153,7 @@ def read_squad_examples(input_file, is_training):
|
|||||||
# guaranteed to be preserved.
|
# guaranteed to be preserved.
|
||||||
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
|
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
|
||||||
cleaned_answer_text = " ".join(
|
cleaned_answer_text = " ".join(
|
||||||
tokenization.whitespace_tokenize(orig_answer_text))
|
tokenization_pytorch.whitespace_tokenize(orig_answer_text))
|
||||||
if actual_text.find(cleaned_answer_text) == -1:
|
if actual_text.find(cleaned_answer_text) == -1:
|
||||||
logger.warning("Could not find answer: '%s' vs. '%s'",
|
logger.warning("Could not find answer: '%s' vs. '%s'",
|
||||||
actual_text, cleaned_answer_text)
|
actual_text, cleaned_answer_text)
|
||||||
@@ -287,7 +287,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|||||||
logger.info("example_index: %s" % (example_index))
|
logger.info("example_index: %s" % (example_index))
|
||||||
logger.info("doc_span_index: %s" % (doc_span_index))
|
logger.info("doc_span_index: %s" % (doc_span_index))
|
||||||
logger.info("tokens: %s" % " ".join(
|
logger.info("tokens: %s" % " ".join(
|
||||||
[tokenization.printable_text(x) for x in tokens]))
|
[tokenization_pytorch.printable_text(x) for x in tokens]))
|
||||||
logger.info("token_to_orig_map: %s" % " ".join(
|
logger.info("token_to_orig_map: %s" % " ".join(
|
||||||
["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
|
["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
|
||||||
logger.info("token_is_max_context: %s" % " ".join([
|
logger.info("token_is_max_context: %s" % " ".join([
|
||||||
@@ -303,7 +303,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|||||||
logger.info("start_position: %d" % (start_position))
|
logger.info("start_position: %d" % (start_position))
|
||||||
logger.info("end_position: %d" % (end_position))
|
logger.info("end_position: %d" % (end_position))
|
||||||
logger.info(
|
logger.info(
|
||||||
"answer: %s" % (tokenization.printable_text(answer_text)))
|
"answer: %s" % (tokenization_pytorch.printable_text(answer_text)))
|
||||||
|
|
||||||
features.append(
|
features.append(
|
||||||
InputFeatures(
|
InputFeatures(
|
||||||
@@ -579,7 +579,7 @@ def get_final_text(pred_text, orig_text, do_lower_case):
|
|||||||
# and `pred_text`, and check if they are the same length. If they are
|
# and `pred_text`, and check if they are the same length. If they are
|
||||||
# NOT the same length, the heuristic has failed. If they are the same
|
# NOT the same length, the heuristic has failed. If they are the same
|
||||||
# length, we assume the characters are one-to-one aligned.
|
# length, we assume the characters are one-to-one aligned.
|
||||||
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
|
tokenizer = tokenization_pytorch.BasicTokenizer(do_lower_case=do_lower_case)
|
||||||
|
|
||||||
tok_text = " ".join(tokenizer.tokenize(orig_text))
|
tok_text = " ".join(tokenizer.tokenize(orig_text))
|
||||||
|
|
||||||
@@ -780,7 +780,7 @@ def main():
|
|||||||
raise ValueError("Output directory () already exists and is not empty.")
|
raise ValueError("Output directory () already exists and is not empty.")
|
||||||
os.makedirs(args.output_dir, exist_ok=True)
|
os.makedirs(args.output_dir, exist_ok=True)
|
||||||
|
|
||||||
tokenizer = tokenization.FullTokenizer(
|
tokenizer = tokenization_pytorch.FullTokenizer(
|
||||||
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
|
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
|
||||||
|
|
||||||
train_examples = None
|
train_examples = None
|
||||||
|
|||||||
Reference in New Issue
Block a user