python 2 compatibility
This commit is contained in:
@@ -17,26 +17,26 @@
|
||||
Adapted from https://github.com/kimiyoung/transformer-xl.
|
||||
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import os
|
||||
import sys
|
||||
import functools
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
import math
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_pretrained_bert import TransfoXLModel, TransfoXLCorpus
|
||||
|
||||
def logging(s, log_path, print_=True, log_=True):
|
||||
if print_:
|
||||
print(s)
|
||||
if log_:
|
||||
with open(log_path, 'a+') as f_log:
|
||||
f_log.write(s + '\n')
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_logger(log_path, **kwargs):
|
||||
return functools.partial(logging, log_path=log_path, **kwargs)
|
||||
|
||||
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
|
||||
# parser.add_argument('--data', type=str, default='../data/wikitext-103',
|
||||
@@ -71,8 +71,8 @@ assert args.ext_len >= 0, 'extended context length must be non-negative'
|
||||
device = torch.device("cuda" if args.cuda else "cpu")
|
||||
|
||||
# Get logger
|
||||
logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
|
||||
log_=not args.no_log)
|
||||
# logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
|
||||
# log_=not args.no_log)
|
||||
|
||||
# Load dataset
|
||||
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
|
||||
@@ -90,7 +90,7 @@ te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
|
||||
model = TransfoXLModel.from_pretrained(args.model_name)
|
||||
model = model.to(device)
|
||||
|
||||
logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
|
||||
logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
|
||||
args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
|
||||
|
||||
model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
|
||||
@@ -116,7 +116,7 @@ def evaluate(eval_iter):
|
||||
total_loss += seq_len * loss.item()
|
||||
total_len += seq_len
|
||||
total_time = time.time() - start_time
|
||||
logging('Time : {:.2f}s, {:.2f}ms/segment'.format(
|
||||
logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
|
||||
total_time, 1000 * total_time / (idx+1)))
|
||||
return total_loss / total_len
|
||||
|
||||
@@ -146,6 +146,6 @@ if valid_loss is not None:
|
||||
if test_loss is not None:
|
||||
log_str += format_log(test_loss, 'test')
|
||||
|
||||
logging('=' * 100)
|
||||
logging(log_str)
|
||||
logging('=' * 100)
|
||||
logger.info('=' * 100)
|
||||
logger.info(log_str)
|
||||
logger.info('=' * 100)
|
||||
|
||||
@@ -15,26 +15,27 @@
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import csv
|
||||
import os
|
||||
import logging
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from tqdm import tqdm, trange
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
|
||||
from pytorch_pretrained_bert.optimization import BertAdam
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
@@ -91,10 +92,12 @@ class DataProcessor(object):
|
||||
@classmethod
|
||||
def _read_tsv(cls, input_file, quotechar=None):
|
||||
"""Reads a tab separated value file."""
|
||||
with open(input_file, "r", encoding='utf-8') as f:
|
||||
with open(input_file, "rb") as f:
|
||||
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
return lines
|
||||
|
||||
@@ -429,7 +432,8 @@ def main():
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
|
||||
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
task_name = args.task_name.lower()
|
||||
|
||||
@@ -451,7 +455,7 @@ def main():
|
||||
|
||||
# Prepare model
|
||||
model = BertForSequenceClassification.from_pretrained(args.bert_model,
|
||||
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
|
||||
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
|
||||
num_labels = num_labels)
|
||||
if args.fp16:
|
||||
model.half()
|
||||
|
||||
@@ -15,26 +15,23 @@
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import os
|
||||
import logging
|
||||
import argparse
|
||||
from tqdm import tqdm, trange
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from io import open
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, RandomSampler
|
||||
from torch.utils.data import DataLoader, Dataset, RandomSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.modeling import BertForPreTraining
|
||||
from pytorch_pretrained_bert.optimization import BertAdam
|
||||
|
||||
from torch.utils.data import Dataset
|
||||
import random
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
|
||||
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt='%m/%d/%Y %H:%M:%S',
|
||||
@@ -185,16 +182,16 @@ class BERTDataset(Dataset):
|
||||
if self.line_buffer is None:
|
||||
# read first non-empty line of file
|
||||
while t1 == "" :
|
||||
t1 = self.file.__next__().strip()
|
||||
t2 = self.file.__next__().strip()
|
||||
t1 = next(self.file).strip()
|
||||
t2 = next(self.file).strip()
|
||||
else:
|
||||
# use t2 from previous iteration as new t1
|
||||
t1 = self.line_buffer
|
||||
t2 = self.file.__next__().strip()
|
||||
t2 = next(self.file).strip()
|
||||
# skip empty rows that are used for separating documents and keep track of current doc id
|
||||
while t2 == "" or t1 == "":
|
||||
t1 = self.file.__next__().strip()
|
||||
t2 = self.file.__next__().strip()
|
||||
t1 = next(self.file).strip()
|
||||
t2 = next(self.file).strip()
|
||||
self.current_doc = self.current_doc+1
|
||||
self.line_buffer = t2
|
||||
|
||||
@@ -228,15 +225,15 @@ class BERTDataset(Dataset):
|
||||
def get_next_line(self):
|
||||
""" Gets next line of random_file and starts over when reaching end of file"""
|
||||
try:
|
||||
line = self.random_file.__next__().strip()
|
||||
line = next(self.random_file).strip()
|
||||
#keep track of which document we are currently looking at to later avoid having the same doc as t1
|
||||
if line == "":
|
||||
self.current_random_doc = self.current_random_doc + 1
|
||||
line = self.random_file.__next__().strip()
|
||||
line = next(self.random_file).strip()
|
||||
except StopIteration:
|
||||
self.random_file.close()
|
||||
self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
|
||||
line = self.random_file.__next__().strip()
|
||||
line = next(self.random_file).strip()
|
||||
return line
|
||||
|
||||
|
||||
@@ -425,6 +422,7 @@ def main():
|
||||
help="The output directory where the model checkpoints will be written.")
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
|
||||
parser.add_argument("--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
@@ -513,7 +511,8 @@ def main():
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
|
||||
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
@@ -579,7 +578,7 @@ def main():
|
||||
if args.local_rank == -1:
|
||||
train_sampler = RandomSampler(train_dataset)
|
||||
else:
|
||||
#TODO: check if this works with current data generator from disk that relies on file.__next__
|
||||
#TODO: check if this works with current data generator from disk that relies on next(file)
|
||||
# (it doesn't return item back by index)
|
||||
train_sampler = DistributedSampler(train_dataset)
|
||||
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
@@ -643,4 +642,4 @@ def accuracy(out, labels):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -15,29 +15,36 @@
|
||||
# limitations under the License.
|
||||
"""Run BERT on SQuAD."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import collections
|
||||
import logging
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import pickle
|
||||
from tqdm import tqdm, trange
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
|
||||
from pytorch_pretrained_bert.optimization import BertAdam
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
|
||||
BertTokenizer,
|
||||
whitespace_tokenize)
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
else:
|
||||
import pickle
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
@@ -784,7 +791,8 @@ def main():
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
|
||||
raise ValueError("Output directory () already exists and is not empty.")
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
@@ -798,7 +806,7 @@ def main():
|
||||
|
||||
# Prepare model
|
||||
model = BertForQuestionAnswering.from_pretrained(args.bert_model,
|
||||
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
|
||||
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)))
|
||||
|
||||
if args.fp16:
|
||||
model.half()
|
||||
|
||||
@@ -15,22 +15,25 @@
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner."""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import argparse
|
||||
import random
|
||||
from tqdm import tqdm, trange
|
||||
import csv
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.modeling import BertForMultipleChoice
|
||||
from pytorch_pretrained_bert.optimization import BertAdam
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
@@ -65,17 +68,17 @@ class SwagExample(object):
|
||||
|
||||
def __repr__(self):
|
||||
l = [
|
||||
f"swag_id: {self.swag_id}",
|
||||
f"context_sentence: {self.context_sentence}",
|
||||
f"start_ending: {self.start_ending}",
|
||||
f"ending_0: {self.endings[0]}",
|
||||
f"ending_1: {self.endings[1]}",
|
||||
f"ending_2: {self.endings[2]}",
|
||||
f"ending_3: {self.endings[3]}",
|
||||
"swag_id: {}".format(self.swag_id),
|
||||
"context_sentence: {}".format(self.context_sentence),
|
||||
"start_ending: {}".format(self.start_ending),
|
||||
"ending_0: {}".format(self.endings[0]),
|
||||
"ending_1: {}".format(self.endings[1]),
|
||||
"ending_2: {}".format(self.endings[2]),
|
||||
"ending_3: {}".format(self.endings[3]),
|
||||
]
|
||||
|
||||
if self.label is not None:
|
||||
l.append(f"label: {self.label}")
|
||||
l.append("label: {}".format(self.label))
|
||||
|
||||
return ", ".join(l)
|
||||
|
||||
@@ -102,7 +105,11 @@ class InputFeatures(object):
|
||||
def read_swag_examples(input_file, is_training):
|
||||
with open(input_file, 'r', encoding='utf-8') as f:
|
||||
reader = csv.reader(f)
|
||||
lines = list(reader)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
|
||||
if is_training and lines[0][-1] != 'label':
|
||||
raise ValueError(
|
||||
@@ -184,15 +191,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
label = example.label
|
||||
if example_index < 5:
|
||||
logger.info("*** Example ***")
|
||||
logger.info(f"swag_id: {example.swag_id}")
|
||||
logger.info("swag_id: {}".format(example.swag_id))
|
||||
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
|
||||
logger.info(f"choice: {choice_idx}")
|
||||
logger.info(f"tokens: {' '.join(tokens)}")
|
||||
logger.info(f"input_ids: {' '.join(map(str, input_ids))}")
|
||||
logger.info(f"input_mask: {' '.join(map(str, input_mask))}")
|
||||
logger.info(f"segment_ids: {' '.join(map(str, segment_ids))}")
|
||||
logger.info("choice: {}".format(choice_idx))
|
||||
logger.info("tokens: {}".format(' '.join(tokens)))
|
||||
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
|
||||
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
|
||||
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
|
||||
if is_training:
|
||||
logger.info(f"label: {label}")
|
||||
logger.info("label: {}".format(label))
|
||||
|
||||
features.append(
|
||||
InputFeatures(
|
||||
@@ -349,7 +356,8 @@ def main():
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
|
||||
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
@@ -362,7 +370,7 @@ def main():
|
||||
|
||||
# Prepare model
|
||||
model = BertForMultipleChoice.from_pretrained(args.bert_model,
|
||||
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
|
||||
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
|
||||
num_choices=4)
|
||||
if args.fp16:
|
||||
model.half()
|
||||
|
||||
Reference in New Issue
Block a user