python 2 compatibility

This commit is contained in:
thomwolf
2019-02-06 00:07:46 +01:00
parent ba37ddc5ce
commit 448937c00d
17 changed files with 246 additions and 184 deletions

View File

@@ -17,26 +17,26 @@
Adapted from https://github.com/kimiyoung/transformer-xl.
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import sys
import functools
import argparse
import logging
import time
import math
import sys
from io import open
import torch
from pytorch_pretrained_bert import TransfoXLModel, TransfoXLCorpus
def logging(s, log_path, print_=True, log_=True):
if print_:
print(s)
if log_:
with open(log_path, 'a+') as f_log:
f_log.write(s + '\n')
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def get_logger(log_path, **kwargs):
return functools.partial(logging, log_path=log_path, **kwargs)
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
# parser.add_argument('--data', type=str, default='../data/wikitext-103',
@@ -71,8 +71,8 @@ assert args.ext_len >= 0, 'extended context length must be non-negative'
device = torch.device("cuda" if args.cuda else "cpu")
# Get logger
logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
log_=not args.no_log)
# logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
# log_=not args.no_log)
# Load dataset
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
@@ -90,7 +90,7 @@ te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
model = TransfoXLModel.from_pretrained(args.model_name)
model = model.to(device)
logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
@@ -116,7 +116,7 @@ def evaluate(eval_iter):
total_loss += seq_len * loss.item()
total_len += seq_len
total_time = time.time() - start_time
logging('Time : {:.2f}s, {:.2f}ms/segment'.format(
logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
total_time, 1000 * total_time / (idx+1)))
return total_loss / total_len
@@ -146,6 +146,6 @@ if valid_loss is not None:
if test_loss is not None:
log_str += format_log(test_loss, 'test')
logging('=' * 100)
logging(log_str)
logging('=' * 100)
logger.info('=' * 100)
logger.info(log_str)
logger.info('=' * 100)

View File

@@ -15,26 +15,27 @@
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import csv
import os
import logging
import argparse
import csv
import logging
import os
import random
from tqdm import tqdm, trange
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.tokenization import BertTokenizer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
@@ -91,10 +92,12 @@ class DataProcessor(object):
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding='utf-8') as f:
with open(input_file, "rb") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines
@@ -429,7 +432,8 @@ def main():
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
task_name = args.task_name.lower()
@@ -451,7 +455,7 @@ def main():
# Prepare model
model = BertForSequenceClassification.from_pretrained(args.bert_model,
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
num_labels = num_labels)
if args.fp16:
model.half()

View File

@@ -15,26 +15,23 @@
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import logging
import argparse
from tqdm import tqdm, trange
import logging
import os
import random
from io import open
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining
from pytorch_pretrained_bert.optimization import BertAdam
from torch.utils.data import Dataset
import random
from pytorch_pretrained_bert.tokenization import BertTokenizer
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
@@ -185,16 +182,16 @@ class BERTDataset(Dataset):
if self.line_buffer is None:
# read first non-empty line of file
while t1 == "" :
t1 = self.file.__next__().strip()
t2 = self.file.__next__().strip()
t1 = next(self.file).strip()
t2 = next(self.file).strip()
else:
# use t2 from previous iteration as new t1
t1 = self.line_buffer
t2 = self.file.__next__().strip()
t2 = next(self.file).strip()
# skip empty rows that are used for separating documents and keep track of current doc id
while t2 == "" or t1 == "":
t1 = self.file.__next__().strip()
t2 = self.file.__next__().strip()
t1 = next(self.file).strip()
t2 = next(self.file).strip()
self.current_doc = self.current_doc+1
self.line_buffer = t2
@@ -228,15 +225,15 @@ class BERTDataset(Dataset):
def get_next_line(self):
""" Gets next line of random_file and starts over when reaching end of file"""
try:
line = self.random_file.__next__().strip()
line = next(self.random_file).strip()
#keep track of which document we are currently looking at to later avoid having the same doc as t1
if line == "":
self.current_random_doc = self.current_random_doc + 1
line = self.random_file.__next__().strip()
line = next(self.random_file).strip()
except StopIteration:
self.random_file.close()
self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
line = self.random_file.__next__().strip()
line = next(self.random_file).strip()
return line
@@ -425,6 +422,7 @@ def main():
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
parser.add_argument("--max_seq_length",
default=128,
type=int,
@@ -513,7 +511,8 @@ def main():
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
@@ -579,7 +578,7 @@ def main():
if args.local_rank == -1:
train_sampler = RandomSampler(train_dataset)
else:
#TODO: check if this works with current data generator from disk that relies on file.__next__
#TODO: check if this works with current data generator from disk that relies on next(file)
# (it doesn't return item back by index)
train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
@@ -643,4 +642,4 @@ def accuracy(out, labels):
if __name__ == "__main__":
main()
main()

View File

@@ -15,29 +15,36 @@
# limitations under the License.
"""Run BERT on SQuAD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import argparse
import collections
import logging
import json
import logging
import math
import os
import random
import pickle
from tqdm import tqdm, trange
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
BertTokenizer,
whitespace_tokenize)
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
@@ -784,7 +791,8 @@ def main():
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
raise ValueError("Output directory () already exists and is not empty.")
os.makedirs(args.output_dir, exist_ok=True)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
@@ -798,7 +806,7 @@ def main():
# Prepare model
model = BertForQuestionAnswering.from_pretrained(args.bert_model,
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)))
if args.fp16:
model.half()

View File

@@ -15,22 +15,25 @@
# limitations under the License.
"""BERT finetuning runner."""
import argparse
import csv
import logging
import os
import argparse
import random
from tqdm import tqdm, trange
import csv
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForMultipleChoice
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.tokenization import BertTokenizer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
@@ -65,17 +68,17 @@ class SwagExample(object):
def __repr__(self):
l = [
f"swag_id: {self.swag_id}",
f"context_sentence: {self.context_sentence}",
f"start_ending: {self.start_ending}",
f"ending_0: {self.endings[0]}",
f"ending_1: {self.endings[1]}",
f"ending_2: {self.endings[2]}",
f"ending_3: {self.endings[3]}",
"swag_id: {}".format(self.swag_id),
"context_sentence: {}".format(self.context_sentence),
"start_ending: {}".format(self.start_ending),
"ending_0: {}".format(self.endings[0]),
"ending_1: {}".format(self.endings[1]),
"ending_2: {}".format(self.endings[2]),
"ending_3: {}".format(self.endings[3]),
]
if self.label is not None:
l.append(f"label: {self.label}")
l.append("label: {}".format(self.label))
return ", ".join(l)
@@ -102,7 +105,11 @@ class InputFeatures(object):
def read_swag_examples(input_file, is_training):
with open(input_file, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
lines = list(reader)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
if is_training and lines[0][-1] != 'label':
raise ValueError(
@@ -184,15 +191,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
label = example.label
if example_index < 5:
logger.info("*** Example ***")
logger.info(f"swag_id: {example.swag_id}")
logger.info("swag_id: {}".format(example.swag_id))
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
logger.info(f"choice: {choice_idx}")
logger.info(f"tokens: {' '.join(tokens)}")
logger.info(f"input_ids: {' '.join(map(str, input_ids))}")
logger.info(f"input_mask: {' '.join(map(str, input_mask))}")
logger.info(f"segment_ids: {' '.join(map(str, segment_ids))}")
logger.info("choice: {}".format(choice_idx))
logger.info("tokens: {}".format(' '.join(tokens)))
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
if is_training:
logger.info(f"label: {label}")
logger.info("label: {}".format(label))
features.append(
InputFeatures(
@@ -349,7 +356,8 @@ def main():
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
@@ -362,7 +370,7 @@ def main():
# Prepare model
model = BertForMultipleChoice.from_pretrained(args.bert_model,
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
num_choices=4)
if args.fp16:
model.half()