diff --git a/examples/extract_features.py b/examples/extract_features.py index b24d6c9ad2..abe7fdffe7 100644 --- a/examples/extract_features.py +++ b/examples/extract_features.py @@ -28,7 +28,7 @@ import torch from torch.utils.data import TensorDataset, DataLoader, SequentialSampler from torch.utils.data.distributed import DistributedSampler -from pytorch_pretrained_bert.tokenization import convert_to_unicode, BertTokenizer +from pytorch_pretrained_bert.tokenization import BertTokenizer from pytorch_pretrained_bert.modeling import BertModel logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', @@ -170,7 +170,7 @@ def read_examples(input_file): unique_id = 0 with open(input_file, "r") as reader: while True: - line = convert_to_unicode(reader.readline()) + line = reader.readline() if not line: break line = line.strip() diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 5ceab4ae26..c6acc091ef 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -30,7 +30,7 @@ import torch from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler -from pytorch_pretrained_bert.tokenization import printable_text, convert_to_unicode, BertTokenizer +from pytorch_pretrained_bert.tokenization import BertTokenizer from pytorch_pretrained_bert.modeling import BertForSequenceClassification from pytorch_pretrained_bert.optimization import BertAdam @@ -122,9 +122,9 @@ class MrpcProcessor(DataProcessor): if i == 0: continue guid = "%s-%s" % (set_type, i) - text_a = convert_to_unicode(line[3]) - text_b = convert_to_unicode(line[4]) - label = convert_to_unicode(line[0]) + text_a = line[3] + text_b = line[4] + label = line[0] examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -154,10 +154,10 @@ class MnliProcessor(DataProcessor): for (i, line) in enumerate(lines): if i == 0: 
continue - guid = "%s-%s" % (set_type, convert_to_unicode(line[0])) - text_a = convert_to_unicode(line[8]) - text_b = convert_to_unicode(line[9]) - label = convert_to_unicode(line[-1]) + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] examples.append( InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -185,8 +185,8 @@ class ColaProcessor(DataProcessor): examples = [] for (i, line) in enumerate(lines): guid = "%s-%s" % (set_type, i) - text_a = convert_to_unicode(line[3]) - label = convert_to_unicode(line[1]) + text_a = line[3] + label = line[1] examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples @@ -273,7 +273,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("tokens: %s" % " ".join( - [printable_text(x) for x in tokens])) + [str(x) for x in tokens])) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) logger.info( diff --git a/examples/run_squad.py b/examples/run_squad.py index c13362b94e..00d5610afe 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -32,7 +32,7 @@ import torch from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler -from pytorch_pretrained_bert.tokenization import printable_text, whitespace_tokenize, BasicTokenizer, BertTokenizer +from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer from pytorch_pretrained_bert.modeling import BertForQuestionAnswering from pytorch_pretrained_bert.optimization import BertAdam @@ -64,9 +64,9 @@ class SquadExample(object): def __repr__(self): s = "" - s += "qas_id: %s" % 
(self.qas_id) s += ", question_text: %s" % ( - printable_text(self.question_text)) + self.question_text) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) @@ -288,8 +288,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, logger.info("unique_id: %s" % (unique_id)) logger.info("example_index: %s" % (example_index)) logger.info("doc_span_index: %s" % (doc_span_index)) - logger.info("tokens: %s" % " ".join( - [printable_text(x) for x in tokens])) + logger.info("tokens: %s" % " ".join(tokens)) logger.info("token_to_orig_map: %s" % " ".join([ "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) logger.info("token_is_max_context: %s" % " ".join([ @@ -305,7 +304,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, logger.info("start_position: %d" % (start_position)) logger.info("end_position: %d" % (end_position)) logger.info( - "answer: %s" % (printable_text(answer_text))) + "answer: %s" % (answer_text)) features.append( InputFeatures( diff --git a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb index d5e6bac68f..67c56ead38 100644 --- a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb +++ b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb @@ -133,7 +133,7 @@ " unique_id = 0\n", " with tf.gfile.GFile(input_file, \"r\") as reader:\n", " while True:\n", - " line = reader.readline()#tokenization.convert_to_unicode(reader.readline())\n", + " line = reader.readline()\n", " if not line:\n", " break\n", " line = line.strip()\n", diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index ab37539792..c37a7e3b9e 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -38,18 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = { 'bert-base-chinese': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", } -def printable_text(text): - """Returns text encoded in a way suitable for print or `tf.logging`.""" - - # These functions want `str` for both Python2 and Python3, but in one case - # it's a Unicode string and in the other it's a byte string. - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary."""