Merge pull request #254 from huggingface/python_2
Adding OpenAI GPT and Transformer-XL models, compatibility with Python 2
This commit is contained in:
@@ -80,10 +80,10 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
|
||||
# The convention in BERT is:
|
||||
# (a) For sequence pairs:
|
||||
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
||||
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
||||
# (b) For single sequences:
|
||||
# tokens: [CLS] the dog is hairy . [SEP]
|
||||
# type_ids: 0 0 0 0 0 0 0
|
||||
# type_ids: 0 0 0 0 0 0 0
|
||||
#
|
||||
# Where "type_ids" are used to indicate whether this is the first
|
||||
# sequence or the second sequence. The embedding vectors for `type=0` and
|
||||
|
||||
@@ -15,26 +15,26 @@
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import csv
|
||||
import os
|
||||
import logging
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from tqdm import tqdm, trange
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
|
||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
@@ -91,10 +91,12 @@ class DataProcessor(object):
|
||||
@classmethod
|
||||
def _read_tsv(cls, input_file, quotechar=None):
|
||||
"""Reads a tab separated value file."""
|
||||
with open(input_file, "r", encoding='utf-8') as f:
|
||||
with open(input_file, "r") as f:
|
||||
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
return lines
|
||||
|
||||
@@ -321,6 +323,10 @@ def main():
|
||||
help="The output directory where the model predictions and checkpoints will be written.")
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from s3")
|
||||
parser.add_argument("--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
@@ -380,9 +386,17 @@ def main():
|
||||
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
|
||||
"0 (default value): dynamic loss scaling.\n"
|
||||
"Positive power of 2: static loss scaling value.\n")
|
||||
|
||||
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
|
||||
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
processors = {
|
||||
"cola": ColaProcessor,
|
||||
"mnli": MnliProcessor,
|
||||
@@ -424,7 +438,8 @@ def main():
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
|
||||
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
task_name = args.task_name.lower()
|
||||
|
||||
@@ -447,8 +462,9 @@ def main():
|
||||
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
|
||||
|
||||
# Prepare model
|
||||
cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
|
||||
model = BertForSequenceClassification.from_pretrained(args.bert_model,
|
||||
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
|
||||
cache_dir=cache_dir,
|
||||
num_labels = num_labels)
|
||||
if args.fp16:
|
||||
model.half()
|
||||
@@ -545,15 +561,21 @@ def main():
|
||||
optimizer.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
# Save a trained model
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
|
||||
if args.do_train:
|
||||
# Save a trained model and the associated configuration
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
|
||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||
with open(output_config_file, 'w') as f:
|
||||
f.write(model_to_save.config.to_json_string())
|
||||
|
||||
# Load a trained model that you have fine-tuned
|
||||
model_state_dict = torch.load(output_model_file)
|
||||
model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels)
|
||||
# Load a trained model and config that you have fine-tuned
|
||||
config = BertConfig(output_config_file)
|
||||
model = BertForSequenceClassification(config, num_labels=num_labels)
|
||||
model.load_state_dict(torch.load(output_model_file))
|
||||
else:
|
||||
model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
|
||||
model.to(device)
|
||||
|
||||
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
|
||||
@@ -15,22 +15,22 @@
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import os
|
||||
import logging
|
||||
import argparse
|
||||
from tqdm import tqdm, trange
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from io import open
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, RandomSampler
|
||||
from torch.utils.data import DataLoader, Dataset, RandomSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.modeling import BertForPreTraining
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||
|
||||
from torch.utils.data import Dataset
|
||||
@@ -179,16 +179,16 @@ class BERTDataset(Dataset):
|
||||
if self.line_buffer is None:
|
||||
# read first non-empty line of file
|
||||
while t1 == "" :
|
||||
t1 = self.file.__next__().strip()
|
||||
t2 = self.file.__next__().strip()
|
||||
t1 = next(self.file).strip()
|
||||
t2 = next(self.file).strip()
|
||||
else:
|
||||
# use t2 from previous iteration as new t1
|
||||
t1 = self.line_buffer
|
||||
t2 = self.file.__next__().strip()
|
||||
t2 = next(self.file).strip()
|
||||
# skip empty rows that are used for separating documents and keep track of current doc id
|
||||
while t2 == "" or t1 == "":
|
||||
t1 = self.file.__next__().strip()
|
||||
t2 = self.file.__next__().strip()
|
||||
t1 = next(self.file).strip()
|
||||
t2 = next(self.file).strip()
|
||||
self.current_doc = self.current_doc+1
|
||||
self.line_buffer = t2
|
||||
|
||||
@@ -222,15 +222,15 @@ class BERTDataset(Dataset):
|
||||
def get_next_line(self):
|
||||
""" Gets next line of random_file and starts over when reaching end of file"""
|
||||
try:
|
||||
line = self.random_file.__next__().strip()
|
||||
line = next(self.random_file).strip()
|
||||
#keep track of which document we are currently looking at to later avoid having the same doc as t1
|
||||
if line == "":
|
||||
self.current_random_doc = self.current_random_doc + 1
|
||||
line = self.random_file.__next__().strip()
|
||||
line = next(self.random_file).strip()
|
||||
except StopIteration:
|
||||
self.random_file.close()
|
||||
self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
|
||||
line = self.random_file.__next__().strip()
|
||||
line = next(self.random_file).strip()
|
||||
return line
|
||||
|
||||
|
||||
@@ -419,6 +419,7 @@ def main():
|
||||
help="The output directory where the model checkpoints will be written.")
|
||||
|
||||
## Other parameters
|
||||
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
|
||||
parser.add_argument("--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
@@ -506,7 +507,8 @@ def main():
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
|
||||
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
@@ -575,7 +577,7 @@ def main():
|
||||
if args.local_rank == -1:
|
||||
train_sampler = RandomSampler(train_dataset)
|
||||
else:
|
||||
#TODO: check if this works with current data generator from disk that relies on file.__next__
|
||||
#TODO: check if this works with current data generator from disk that relies on next(file)
|
||||
# (it doesn't return item back by index)
|
||||
train_sampler = DistributedSampler(train_dataset)
|
||||
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
@@ -641,4 +643,4 @@ def accuracy(out, labels):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
259
examples/run_openai_gpt.py
Normal file
259
examples/run_openai_gpt.py
Normal file
@@ -0,0 +1,259 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" OpenAI GPT model fine-tuning script.
|
||||
Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
|
||||
Itself adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
|
||||
|
||||
With its default values, this script fine-tunes and evaluates a pretrained OpenAI GPT model on the RocStories dataset.
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
import csv
|
||||
import random
|
||||
import logging
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
|
||||
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path
|
||||
|
||||
ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def accuracy(out, labels):
    """Count how many rows of `out` have their arg-max at the labelled index.

    Args:
        out: 2-D array-like of scores, one row per example.
        labels: array-like of expected class indices, one per example.

    Returns:
        The number of correct predictions (a count, not a ratio).
    """
    predicted = np.argmax(out, axis=1)
    is_correct = predicted == labels
    return np.sum(is_correct)
|
||||
|
||||
def load_rocstories_dataset(dataset_path):
    """Read a RocStories CSV file into a list of examples.

    The first row is treated as a header and skipped. For every other row,
    columns 1-4 are joined with spaces into the story, columns 5 and 6 are the
    two candidate continuations, and the last column is the 1-based index of
    the correct continuation (converted here to 0-based).

    Args:
        dataset_path: path to the CSV file.

    Returns:
        A list of tuples (story, 1st continuation, 2nd continuation, label).
        An empty file yields an empty list.
    """
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields containing newlines are parsed correctly.
    with open(dataset_path, encoding='utf_8', newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row (tolerates an empty file)
        output = [
            (' '.join(line[1:5]), line[5], line[6], int(line[-1]) - 1)
            for line in tqdm(reader)
        ]
    return output
|
||||
|
||||
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)

        Each dataset is turned into four int64 tensors:
            input_ids    (n_batch, 2, input_len), zero-padded, where
                         input_ids[batch, alternative, :] =
                         [start_token] + story[:cap_length] + [delimiter_token] + cont[:cap_length] + [clf_token]
            mc_token_ids (n_batch, 2), position of clf_token in each sequence
            lm_labels    (n_batch, 2, input_len), the sequence shifted left by one, padded with -1
            mc_labels    (n_batch,), the label of each example

        Returns one tuple of these four tensors per input dataset.
    """
    def assemble(story, continuation):
        # Story and continuation are each truncated to cap_length tokens.
        return [start_token] + story[:cap_length] + [delimiter_token] + continuation[:cap_length] + [clf_token]

    tensor_datasets = []
    for dataset in encoded_datasets:
        n_examples = len(dataset)
        input_ids = np.zeros((n_examples, 2, input_len), dtype=np.int64)
        mc_token_ids = np.zeros((n_examples, 2), dtype=np.int64)
        lm_labels = np.full((n_examples, 2, input_len), fill_value=-1, dtype=np.int64)
        mc_labels = np.zeros((n_examples,), dtype=np.int64)
        for row, (story, cont1, cont2, mc_label) in enumerate(dataset):
            for alt, continuation in enumerate((cont1, cont2)):
                sequence = assemble(story, continuation)
                seq_len = len(sequence)
                input_ids[row, alt, :seq_len] = sequence
                mc_token_ids[row, alt] = seq_len - 1
                # Language-model targets are the inputs shifted by one position.
                lm_labels[row, alt, :seq_len - 1] = sequence[1:]
            mc_labels[row] = mc_label
        arrays = (input_ids, mc_token_ids, lm_labels, mc_labels)
        tensor_datasets.append(tuple(torch.tensor(array) for array in arrays))
    return tensor_datasets
|
||||
|
||||
def main():
    """Fine-tune and/or evaluate an OpenAI GPT double-heads model on RocStories.

    Parses command-line arguments, seeds the RNGs, loads the tokenizer and the
    pretrained model (extended with three special tokens), encodes the train
    and eval datasets, and then:

      * with --do_train: trains for --num_train_epochs epochs, saves the
        weights to <output_dir>/pytorch_model.bin and reloads them;
      * with --do_eval: computes the multiple-choice loss and accuracy on the
        eval set and writes them to <output_dir>/eval_results.txt.

    Raises:
        ValueError: if neither --do_train nor --do_eval is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG in use so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        # NOTE(review): the archive is downloaded here, but the returned path is
        # never used; the loaders below still read args.train_dataset and
        # args.eval_dataset, which are empty in this branch. Explicit dataset
        # paths are effectively required until this is wired up.
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    # Story and continuation are each capped so that, with the 3 special
    # tokens added, a sequence cannot exceed n_positions.
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
                       for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # Honour --weight_decay (default 0.01) instead of a hard-coded value.
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                # Total loss: weighted language-modeling loss plus multiple-choice loss.
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                # Gradients accumulate across backward() calls; clear them so
                # each step uses only the current batch's gradients.
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss, for the progress bar only.
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        config = model.config
        torch.save(model_to_save.state_dict(), output_model_file)

        # Load a trained model that you have fine-tuned
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTDoubleHeadsModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                # With labels the model returns losses; without them, logits.
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # Loss is averaged per batch, accuracy per example.
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss/nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -15,29 +15,36 @@
|
||||
# limitations under the License.
|
||||
"""Run BERT on SQuAD."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import collections
|
||||
import logging
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import pickle
|
||||
from tqdm import tqdm, trange
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
|
||||
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
|
||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
|
||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||
from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
|
||||
BertTokenizer,
|
||||
whitespace_tokenize)
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
else:
|
||||
import pickle
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
@@ -863,7 +870,8 @@ def main():
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
|
||||
raise ValueError("Output directory () already exists and is not empty.")
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
@@ -879,7 +887,7 @@ def main():
|
||||
|
||||
# Prepare model
|
||||
model = BertForQuestionAnswering.from_pretrained(args.bert_model,
|
||||
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
|
||||
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)))
|
||||
|
||||
if args.fp16:
|
||||
model.half()
|
||||
@@ -909,7 +917,7 @@ def main():
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex.optimizer import FP16_Optimizer
|
||||
from apex.optimizers import FP16_Optimizer
|
||||
from apex.optimizers import FusedAdam
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
|
||||
@@ -993,14 +1001,19 @@ def main():
|
||||
optimizer.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
# Save a trained model
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
|
||||
if args.do_train:
|
||||
# Save a trained model and the associated configuration
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
|
||||
# Load a trained model that you have fine-tuned
|
||||
model_state_dict = torch.load(output_model_file)
|
||||
model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
|
||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||
with open(output_config_file, 'w') as f:
|
||||
f.write(model_to_save.config.to_json_string())
|
||||
|
||||
# Load a trained model and config that you have fine-tuned
|
||||
config = BertConfig(output_config_file)
|
||||
model = BertForQuestionAnswering(config)
|
||||
model.load_state_dict(torch.load(output_model_file))
|
||||
else:
|
||||
model = BertForQuestionAnswering.from_pretrained(args.bert_model)
|
||||
|
||||
|
||||
@@ -15,22 +15,25 @@
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner."""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import argparse
|
||||
import random
|
||||
from tqdm import tqdm, trange
|
||||
import csv
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
|
||||
TensorDataset)
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.modeling import BertForMultipleChoice
|
||||
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
|
||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
from pytorch_pretrained_bert.tokenization import BertTokenizer
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
@@ -65,17 +68,17 @@ class SwagExample(object):
|
||||
|
||||
def __repr__(self):
|
||||
l = [
|
||||
f"swag_id: {self.swag_id}",
|
||||
f"context_sentence: {self.context_sentence}",
|
||||
f"start_ending: {self.start_ending}",
|
||||
f"ending_0: {self.endings[0]}",
|
||||
f"ending_1: {self.endings[1]}",
|
||||
f"ending_2: {self.endings[2]}",
|
||||
f"ending_3: {self.endings[3]}",
|
||||
"swag_id: {}".format(self.swag_id),
|
||||
"context_sentence: {}".format(self.context_sentence),
|
||||
"start_ending: {}".format(self.start_ending),
|
||||
"ending_0: {}".format(self.endings[0]),
|
||||
"ending_1: {}".format(self.endings[1]),
|
||||
"ending_2: {}".format(self.endings[2]),
|
||||
"ending_3: {}".format(self.endings[3]),
|
||||
]
|
||||
|
||||
if self.label is not None:
|
||||
l.append(f"label: {self.label}")
|
||||
l.append("label: {}".format(self.label))
|
||||
|
||||
return ", ".join(l)
|
||||
|
||||
@@ -102,7 +105,11 @@ class InputFeatures(object):
|
||||
def read_swag_examples(input_file, is_training):
|
||||
with open(input_file, 'r', encoding='utf-8') as f:
|
||||
reader = csv.reader(f)
|
||||
lines = list(reader)
|
||||
lines = []
|
||||
for line in reader:
|
||||
if sys.version_info[0] == 2:
|
||||
line = list(unicode(cell, 'utf-8') for cell in line)
|
||||
lines.append(line)
|
||||
|
||||
if is_training and lines[0][-1] != 'label':
|
||||
raise ValueError(
|
||||
@@ -184,15 +191,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
|
||||
label = example.label
|
||||
if example_index < 5:
|
||||
logger.info("*** Example ***")
|
||||
logger.info(f"swag_id: {example.swag_id}")
|
||||
logger.info("swag_id: {}".format(example.swag_id))
|
||||
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
|
||||
logger.info(f"choice: {choice_idx}")
|
||||
logger.info(f"tokens: {' '.join(tokens)}")
|
||||
logger.info(f"input_ids: {' '.join(map(str, input_ids))}")
|
||||
logger.info(f"input_mask: {' '.join(map(str, input_mask))}")
|
||||
logger.info(f"segment_ids: {' '.join(map(str, segment_ids))}")
|
||||
logger.info("choice: {}".format(choice_idx))
|
||||
logger.info("tokens: {}".format(' '.join(tokens)))
|
||||
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
|
||||
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
|
||||
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
|
||||
if is_training:
|
||||
logger.info(f"label: {label}")
|
||||
logger.info("label: {}".format(label))
|
||||
|
||||
features.append(
|
||||
InputFeatures(
|
||||
@@ -344,7 +351,8 @@ def main():
|
||||
|
||||
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
|
||||
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
|
||||
@@ -359,7 +367,7 @@ def main():
|
||||
|
||||
# Prepare model
|
||||
model = BertForMultipleChoice.from_pretrained(args.bert_model,
|
||||
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
|
||||
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
|
||||
num_choices=4)
|
||||
if args.fp16:
|
||||
model.half()
|
||||
@@ -461,18 +469,25 @@ def main():
|
||||
optimizer.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
# Save a trained model
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
|
||||
|
||||
# Load a trained model that you have fine-tuned
|
||||
model_state_dict = torch.load(output_model_file)
|
||||
model = BertForMultipleChoice.from_pretrained(args.bert_model,
|
||||
state_dict=model_state_dict,
|
||||
num_choices=4)
|
||||
if args.do_train:
|
||||
# Save a trained model and the associated configuration
|
||||
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
|
||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
|
||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||
with open(output_config_file, 'w') as f:
|
||||
f.write(model_to_save.config.to_json_string())
|
||||
|
||||
# Load a trained model and config that you have fine-tuned
|
||||
config = BertConfig(output_config_file)
|
||||
model = BertForMultipleChoice(config, num_choices=4)
|
||||
model.load_state_dict(torch.load(output_model_file))
|
||||
else:
|
||||
model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
|
||||
model.to(device)
|
||||
|
||||
|
||||
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
|
||||
eval_features = convert_examples_to_features(
|
||||
|
||||
152
examples/run_transfo_xl.py
Normal file
152
examples/run_transfo_xl.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" PyTorch Transformer XL model evaluation script.
|
||||
Adapted from https://github.com/kimiyoung/transformer-xl.
|
||||
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
|
||||
|
||||
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import time
|
||||
import math
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus
|
||||
|
||||
# Configure the root logger once at import time (timestamp - level - logger
# name - message), then create this module's own logger.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)
logger = logging.getLogger(__name__)
|
||||
|
||||
def main():
    """Evaluate a pretrained Transformer-XL language model and log the results.

    Parses the command line, loads the pre-processed corpus and the model
    named by ``--model_name``, runs the model over the split(s) selected by
    ``--split`` and logs, for every evaluated split, the length-weighted
    average loss together with ``exp(loss)`` (reported as "ppl").

    Raises:
        SystemExit: if ``--ext_len`` is negative (reported through argparse).
        ValueError: if an evaluated split yields no segments, so that no
            average loss can be computed.
    """
    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
                        help='pretrained model name')
    parser.add_argument('--split', type=str, default='test',
                        choices=['all', 'valid', 'test'],
                        help='which split to evaluate')
    parser.add_argument('--batch_size', type=int, default=10,
                        help='batch size')
    parser.add_argument('--tgt_len', type=int, default=128,
                        help='number of tokens to predict')
    parser.add_argument('--ext_len', type=int, default=0,
                        help='length of the extended context')
    parser.add_argument('--mem_len', type=int, default=1600,
                        help='length of the retained previous heads')
    parser.add_argument('--clamp_len', type=int, default=1000,
                        help='max positional embedding index')
    parser.add_argument('--no_cuda', action='store_true',
                        help='Do not use CUDA even though CUDA is available')
    # NOTE(review): --work_dir (required) and --no_log are parsed but never
    # read anywhere in this function; kept so existing command lines still work.
    parser.add_argument('--work_dir', type=str, required=True,
                        help='path to the work_dir')
    parser.add_argument('--no_log', action='store_true',
                        help='do not log the eval result')
    parser.add_argument('--same_length', action='store_true',
                        help='set same length attention with masking')
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    # Validate explicitly instead of with `assert`, which is stripped under -O.
    if args.ext_len < 0:
        parser.error('extended context length must be non-negative')

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        # NOTE(review): server_port is forwarded as a string - confirm that
        # ptvsd accepts a non-integer port.
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    logger.info("device: {}".format(device))

    # Load a pre-processed dataset
    # You can also build the corpus yourself using TransfoXLCorpus methods
    # The pre-processing involves computing word frequencies to prepare the
    # Adaptive input and SoftMax, and tokenizing the dataset
    # The pre-processed corpus is a conversion (using the conversion script)
    corpus = TransfoXLCorpus.from_pretrained(args.model_name)

    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
                                  device=device, ext_len=args.ext_len)
    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
                                  device=device, ext_len=args.ext_len)

    # Load a pre-trained model
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
    model = model.to(device)

    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))

    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    if args.clamp_len > 0:
        model.clamp_len = args.clamp_len
    if args.same_length:
        model.same_length = True

    ###############################################################################
    # Evaluation code
    ###############################################################################
    def evaluate(eval_iter):
        """Return the average loss over ``eval_iter``, weighted by segment length."""
        # Turn on evaluation mode which disables dropout.
        model.eval()
        total_len, total_loss = 0, 0.
        num_segments = 0
        start_time = time.time()
        with torch.no_grad():
            # The memory returned for one segment is fed back in for the next.
            mems = None
            for data, target, seq_len in eval_iter:
                loss, mems = model(data, target, mems)
                total_loss += seq_len * loss.mean().item()
                total_len += seq_len
                num_segments += 1
        total_time = time.time() - start_time
        # An empty iterator used to crash on an unbound loop index or a
        # division by zero; fail with an explicit message instead.
        if num_segments == 0 or total_len == 0:
            raise ValueError('evaluation iterator yielded no data; cannot compute an average loss')
        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
            total_time, 1000 * total_time / num_segments))
        return total_loss / total_len

    # Run on the requested split(s); for 'all' the test split is evaluated first.
    valid_loss = test_loss = None
    if args.split in ('all', 'test'):
        test_loss = evaluate(te_iter)
    if args.split in ('all', 'valid'):
        valid_loss = evaluate(va_iter)

    def format_log(loss, split):
        """Format the loss and its exponential (perplexity) for one split."""
        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
            split, loss, math.exp(loss))
        return log_str

    log_str = ''
    if valid_loss is not None:
        log_str += format_log(valid_loss, 'valid')
    if test_loss is not None:
        log_str += format_log(test_loss, 'test')

    logger.info('=' * 100)
    logger.info(log_str)
    logger.info('=' * 100)
|
||||
|
||||
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user