remove convert_to_unicode and printable_text from examples
This commit is contained in:
@@ -28,7 +28,7 @@ import torch
|
|||||||
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
|
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
|
||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
|
|
||||||
from pytorch_pretrained_bert.tokenization import convert_to_unicode, BertTokenizer
|
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||||
from pytorch_pretrained_bert.modeling import BertModel
|
from pytorch_pretrained_bert.modeling import BertModel
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
@@ -170,7 +170,7 @@ def read_examples(input_file):
|
|||||||
unique_id = 0
|
unique_id = 0
|
||||||
with open(input_file, "r") as reader:
|
with open(input_file, "r") as reader:
|
||||||
while True:
|
while True:
|
||||||
line = convert_to_unicode(reader.readline())
|
line = reader.readline()
|
||||||
if not line:
|
if not line:
|
||||||
break
|
break
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ import torch
|
|||||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
|
|
||||||
from pytorch_pretrained_bert.tokenization import printable_text, convert_to_unicode, BertTokenizer
|
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||||
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
|
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
|
||||||
from pytorch_pretrained_bert.optimization import BertAdam
|
from pytorch_pretrained_bert.optimization import BertAdam
|
||||||
|
|
||||||
@@ -122,9 +122,9 @@ class MrpcProcessor(DataProcessor):
|
|||||||
if i == 0:
|
if i == 0:
|
||||||
continue
|
continue
|
||||||
guid = "%s-%s" % (set_type, i)
|
guid = "%s-%s" % (set_type, i)
|
||||||
text_a = convert_to_unicode(line[3])
|
text_a = line[3]
|
||||||
text_b = convert_to_unicode(line[4])
|
text_b = line[4]
|
||||||
label = convert_to_unicode(line[0])
|
label = line[0]
|
||||||
examples.append(
|
examples.append(
|
||||||
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
||||||
return examples
|
return examples
|
||||||
@@ -154,10 +154,10 @@ class MnliProcessor(DataProcessor):
|
|||||||
for (i, line) in enumerate(lines):
|
for (i, line) in enumerate(lines):
|
||||||
if i == 0:
|
if i == 0:
|
||||||
continue
|
continue
|
||||||
guid = "%s-%s" % (set_type, convert_to_unicode(line[0]))
|
guid = "%s-%s" % (set_type, line[0])
|
||||||
text_a = convert_to_unicode(line[8])
|
text_a = line[8]
|
||||||
text_b = convert_to_unicode(line[9])
|
text_b = line[9]
|
||||||
label = convert_to_unicode(line[-1])
|
label = line[-1]
|
||||||
examples.append(
|
examples.append(
|
||||||
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
|
||||||
return examples
|
return examples
|
||||||
@@ -185,8 +185,8 @@ class ColaProcessor(DataProcessor):
|
|||||||
examples = []
|
examples = []
|
||||||
for (i, line) in enumerate(lines):
|
for (i, line) in enumerate(lines):
|
||||||
guid = "%s-%s" % (set_type, i)
|
guid = "%s-%s" % (set_type, i)
|
||||||
text_a = convert_to_unicode(line[3])
|
text_a = line[3]
|
||||||
label = convert_to_unicode(line[1])
|
label = line[1]
|
||||||
examples.append(
|
examples.append(
|
||||||
InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
|
InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
|
||||||
return examples
|
return examples
|
||||||
@@ -273,7 +273,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
|||||||
logger.info("*** Example ***")
|
logger.info("*** Example ***")
|
||||||
logger.info("guid: %s" % (example.guid))
|
logger.info("guid: %s" % (example.guid))
|
||||||
logger.info("tokens: %s" % " ".join(
|
logger.info("tokens: %s" % " ".join(
|
||||||
[printable_text(x) for x in tokens]))
|
[str(x) for x in tokens]))
|
||||||
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
|
||||||
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ import torch
|
|||||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
|
|
||||||
from pytorch_pretrained_bert.tokenization import printable_text, whitespace_tokenize, BasicTokenizer, BertTokenizer
|
from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
|
||||||
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
|
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
|
||||||
from pytorch_pretrained_bert.optimization import BertAdam
|
from pytorch_pretrained_bert.optimization import BertAdam
|
||||||
|
|
||||||
@@ -64,9 +64,9 @@ class SquadExample(object):
|
|||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
s = ""
|
s = ""
|
||||||
s += "qas_id: %s" % (printable_text(self.qas_id))
|
s += "qas_id: %s" % (self.qas_id)
|
||||||
s += ", question_text: %s" % (
|
s += ", question_text: %s" % (
|
||||||
printable_text(self.question_text))
|
self.question_text)
|
||||||
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
|
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
|
||||||
if self.start_position:
|
if self.start_position:
|
||||||
s += ", start_position: %d" % (self.start_position)
|
s += ", start_position: %d" % (self.start_position)
|
||||||
@@ -288,8 +288,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|||||||
logger.info("unique_id: %s" % (unique_id))
|
logger.info("unique_id: %s" % (unique_id))
|
||||||
logger.info("example_index: %s" % (example_index))
|
logger.info("example_index: %s" % (example_index))
|
||||||
logger.info("doc_span_index: %s" % (doc_span_index))
|
logger.info("doc_span_index: %s" % (doc_span_index))
|
||||||
logger.info("tokens: %s" % " ".join(
|
logger.info("tokens: %s" % " ".join(tokens))
|
||||||
[printable_text(x) for x in tokens]))
|
|
||||||
logger.info("token_to_orig_map: %s" % " ".join([
|
logger.info("token_to_orig_map: %s" % " ".join([
|
||||||
"%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
|
"%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
|
||||||
logger.info("token_is_max_context: %s" % " ".join([
|
logger.info("token_is_max_context: %s" % " ".join([
|
||||||
@@ -305,7 +304,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
|||||||
logger.info("start_position: %d" % (start_position))
|
logger.info("start_position: %d" % (start_position))
|
||||||
logger.info("end_position: %d" % (end_position))
|
logger.info("end_position: %d" % (end_position))
|
||||||
logger.info(
|
logger.info(
|
||||||
"answer: %s" % (printable_text(answer_text)))
|
"answer: %s" % (answer_text))
|
||||||
|
|
||||||
features.append(
|
features.append(
|
||||||
InputFeatures(
|
InputFeatures(
|
||||||
|
|||||||
@@ -133,7 +133,7 @@
|
|||||||
" unique_id = 0\n",
|
" unique_id = 0\n",
|
||||||
" with tf.gfile.GFile(input_file, \"r\") as reader:\n",
|
" with tf.gfile.GFile(input_file, \"r\") as reader:\n",
|
||||||
" while True:\n",
|
" while True:\n",
|
||||||
" line = reader.readline()#tokenization.convert_to_unicode(reader.readline())\n",
|
" line = reader.readline()\n",
|
||||||
" if not line:\n",
|
" if not line:\n",
|
||||||
" break\n",
|
" break\n",
|
||||||
" line = line.strip()\n",
|
" line = line.strip()\n",
|
||||||
|
|||||||
@@ -38,18 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
|
|||||||
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
|
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
|
||||||
}
|
}
|
||||||
|
|
||||||
def printable_text(text):
|
|
||||||
"""Returns text encoded in a way suitable for print or `tf.logging`."""
|
|
||||||
|
|
||||||
# These functions want `str` for both Python2 and Python3, but in one case
|
|
||||||
# it's a Unicode string and in the other it's a byte string.
|
|
||||||
if isinstance(text, str):
|
|
||||||
return text
|
|
||||||
elif isinstance(text, bytes):
|
|
||||||
return text.decode("utf-8", "ignore")
|
|
||||||
else:
|
|
||||||
raise ValueError("Unsupported string type: %s" % (type(text)))
|
|
||||||
|
|
||||||
|
|
||||||
def load_vocab(vocab_file):
|
def load_vocab(vocab_file):
|
||||||
"""Loads a vocabulary file into a dictionary."""
|
"""Loads a vocabulary file into a dictionary."""
|
||||||
|
|||||||
Reference in New Issue
Block a user