wip
This commit is contained in:
@@ -26,7 +26,8 @@ from .data import (is_sklearn_available,
|
||||
InputExample, InputFeatures, DataProcessor,
|
||||
glue_output_modes, glue_convert_examples_to_features,
|
||||
glue_processors, glue_tasks_num_labels,
|
||||
squad_convert_examples_to_features, SquadFeatures)
|
||||
squad_convert_examples_to_features, SquadFeatures,
|
||||
SquadExample, read_squad_examples)
|
||||
|
||||
if is_sklearn_available():
|
||||
from .data import glue_compute_metrics
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
|
||||
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||
from .processors import squad_convert_examples_to_features
|
||||
from .processors import squad_convert_examples_to_features, SquadExample, read_squad_examples
|
||||
|
||||
from .metrics import is_sklearn_available
|
||||
if is_sklearn_available():
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from .utils import InputExample, InputFeatures, DataProcessor
|
||||
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
|
||||
from .squad import squad_convert_examples_to_features, SquadFeatures
|
||||
from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, read_squad_examples
|
||||
|
||||
|
||||
@@ -2,7 +2,9 @@ from tqdm import tqdm
|
||||
import collections
|
||||
import logging
|
||||
import os
|
||||
import json
|
||||
|
||||
from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
|
||||
from .utils import DataProcessor, InputExample, InputFeatures
|
||||
from ...file_utils import is_tf_available
|
||||
|
||||
@@ -11,6 +13,7 @@ if is_tf_available():
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
doc_stride, max_query_length, is_training,
|
||||
cls_token_at_end=False,
|
||||
@@ -265,6 +268,125 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
|
||||
return features
|
||||
|
||||
|
||||
def read_squad_examples(input_file, is_training, version_2_with_negative):
|
||||
"""Read a SQuAD json file into a list of SquadExample."""
|
||||
with open(input_file, "r", encoding='utf-8') as reader:
|
||||
input_data = json.load(reader)["data"]
|
||||
|
||||
def is_whitespace(c):
|
||||
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
|
||||
return True
|
||||
return False
|
||||
|
||||
examples = []
|
||||
for entry in input_data:
|
||||
for paragraph in entry["paragraphs"]:
|
||||
paragraph_text = paragraph["context"]
|
||||
doc_tokens = []
|
||||
char_to_word_offset = []
|
||||
prev_is_whitespace = True
|
||||
for c in paragraph_text:
|
||||
if is_whitespace(c):
|
||||
prev_is_whitespace = True
|
||||
else:
|
||||
if prev_is_whitespace:
|
||||
doc_tokens.append(c)
|
||||
else:
|
||||
doc_tokens[-1] += c
|
||||
prev_is_whitespace = False
|
||||
char_to_word_offset.append(len(doc_tokens) - 1)
|
||||
|
||||
for qa in paragraph["qas"]:
|
||||
qas_id = qa["id"]
|
||||
question_text = qa["question"]
|
||||
start_position = None
|
||||
end_position = None
|
||||
orig_answer_text = None
|
||||
is_impossible = False
|
||||
if is_training:
|
||||
if version_2_with_negative:
|
||||
is_impossible = qa["is_impossible"]
|
||||
if (len(qa["answers"]) != 1) and (not is_impossible):
|
||||
raise ValueError(
|
||||
"For training, each question should have exactly 1 answer.")
|
||||
if not is_impossible:
|
||||
answer = qa["answers"][0]
|
||||
orig_answer_text = answer["text"]
|
||||
answer_offset = answer["answer_start"]
|
||||
answer_length = len(orig_answer_text)
|
||||
start_position = char_to_word_offset[answer_offset]
|
||||
end_position = char_to_word_offset[answer_offset + answer_length - 1]
|
||||
# Only add answers where the text can be exactly recovered from the
|
||||
# document. If this CAN'T happen it's likely due to weird Unicode
|
||||
# stuff so we will just skip the example.
|
||||
#
|
||||
# Note that this means for training mode, every example is NOT
|
||||
# guaranteed to be preserved.
|
||||
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
|
||||
cleaned_answer_text = " ".join(
|
||||
whitespace_tokenize(orig_answer_text))
|
||||
if actual_text.find(cleaned_answer_text) == -1:
|
||||
logger.warning("Could not find answer: '%s' vs. '%s'",
|
||||
actual_text, cleaned_answer_text)
|
||||
continue
|
||||
else:
|
||||
start_position = -1
|
||||
end_position = -1
|
||||
orig_answer_text = ""
|
||||
|
||||
example = SquadExample(
|
||||
qas_id=qas_id,
|
||||
question_text=question_text,
|
||||
doc_tokens=doc_tokens,
|
||||
orig_answer_text=orig_answer_text,
|
||||
start_position=start_position,
|
||||
end_position=end_position,
|
||||
is_impossible=is_impossible)
|
||||
examples.append(example)
|
||||
return examples
|
||||
|
||||
|
||||
class SquadExample(object):
|
||||
"""
|
||||
A single training/test example for the Squad dataset.
|
||||
For examples without an answer, the start and end position are -1.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
qas_id,
|
||||
question_text,
|
||||
doc_tokens,
|
||||
orig_answer_text=None,
|
||||
start_position=None,
|
||||
end_position=None,
|
||||
is_impossible=None):
|
||||
self.qas_id = qas_id
|
||||
self.question_text = question_text
|
||||
self.doc_tokens = doc_tokens
|
||||
self.orig_answer_text = orig_answer_text
|
||||
self.start_position = start_position
|
||||
self.end_position = end_position
|
||||
self.is_impossible = is_impossible
|
||||
|
||||
def __str__(self):
|
||||
return self.__repr__()
|
||||
|
||||
def __repr__(self):
|
||||
s = ""
|
||||
s += "qas_id: %s" % (self.qas_id)
|
||||
s += ", question_text: %s" % (
|
||||
self.question_text)
|
||||
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
|
||||
if self.start_position:
|
||||
s += ", start_position: %d" % (self.start_position)
|
||||
if self.end_position:
|
||||
s += ", end_position: %d" % (self.end_position)
|
||||
if self.is_impossible:
|
||||
s += ", is_impossible: %r" % (self.is_impossible)
|
||||
return s
|
||||
|
||||
|
||||
class SquadFeatures(object):
|
||||
"""A single set of features of data."""
|
||||
|
||||
|
||||
@@ -605,6 +605,10 @@ class PreTrainedTokenizer(object):
|
||||
vocabularies (BPE/SentencePieces/WordPieces).
|
||||
|
||||
Take care of added tokens.
|
||||
|
||||
text: The sequence to be encoded.
|
||||
return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
|
||||
**kwargs: passed to the child `self.tokenize()` method
|
||||
"""
|
||||
def split_on_token(tok, text):
|
||||
result = []
|
||||
|
||||
Reference in New Issue
Block a user