Merge branch 'master' into fourth-release
This commit is contained in:
@@ -19,7 +19,7 @@ This implementation is provided with [Google's pre-trained models](https://githu
|
|||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
This repo was tested on Python 3.5+ and PyTorch 0.4.1
|
This repo was tested on Python 3.6+ and PyTorch 0.4.1
|
||||||
|
|
||||||
### With pip
|
### With pip
|
||||||
|
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification
|
|||||||
from pytorch_pretrained_bert.optimization import BertAdam
|
from pytorch_pretrained_bert.optimization import BertAdam
|
||||||
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||||
|
|
||||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||||
level = logging.INFO)
|
level = logging.INFO)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -196,9 +196,7 @@ class ColaProcessor(DataProcessor):
|
|||||||
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
|
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
|
||||||
"""Loads a data file into a list of `InputBatch`s."""
|
"""Loads a data file into a list of `InputBatch`s."""
|
||||||
|
|
||||||
label_map = {}
|
label_map = {label : i for i, label in enumerate(label_list)}
|
||||||
for (i, label) in enumerate(label_list):
|
|
||||||
label_map[label] = i
|
|
||||||
|
|
||||||
features = []
|
features = []
|
||||||
for (ex_index, example) in enumerate(examples):
|
for (ex_index, example) in enumerate(examples):
|
||||||
@@ -207,8 +205,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
|||||||
tokens_b = None
|
tokens_b = None
|
||||||
if example.text_b:
|
if example.text_b:
|
||||||
tokens_b = tokenizer.tokenize(example.text_b)
|
tokens_b = tokenizer.tokenize(example.text_b)
|
||||||
|
|
||||||
if tokens_b:
|
|
||||||
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
# Modifies `tokens_a` and `tokens_b` in place so that the total
|
||||||
# length is less than the specified length.
|
# length is less than the specified length.
|
||||||
# Account for [CLS], [SEP], [SEP] with "- 3"
|
# Account for [CLS], [SEP], [SEP] with "- 3"
|
||||||
@@ -216,7 +212,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
|||||||
else:
|
else:
|
||||||
# Account for [CLS] and [SEP] with "- 2"
|
# Account for [CLS] and [SEP] with "- 2"
|
||||||
if len(tokens_a) > max_seq_length - 2:
|
if len(tokens_a) > max_seq_length - 2:
|
||||||
tokens_a = tokens_a[0:(max_seq_length - 2)]
|
tokens_a = tokens_a[:(max_seq_length - 2)]
|
||||||
|
|
||||||
# The convention in BERT is:
|
# The convention in BERT is:
|
||||||
# (a) For sequence pairs:
|
# (a) For sequence pairs:
|
||||||
@@ -236,22 +232,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
|||||||
# For classification tasks, the first vector (corresponding to [CLS]) is
|
# For classification tasks, the first vector (corresponding to [CLS]) is
|
||||||
# used as the "sentence vector". Note that this only makes sense because
|
# used as the "sentence vector". Note that this only makes sense because
|
||||||
# the entire model is fine-tuned.
|
# the entire model is fine-tuned.
|
||||||
tokens = []
|
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
|
||||||
segment_ids = []
|
segment_ids = [0] * len(tokens)
|
||||||
tokens.append("[CLS]")
|
|
||||||
segment_ids.append(0)
|
|
||||||
for token in tokens_a:
|
|
||||||
tokens.append(token)
|
|
||||||
segment_ids.append(0)
|
|
||||||
tokens.append("[SEP]")
|
|
||||||
segment_ids.append(0)
|
|
||||||
|
|
||||||
if tokens_b:
|
if tokens_b:
|
||||||
for token in tokens_b:
|
tokens += tokens_b + ["[SEP]"]
|
||||||
tokens.append(token)
|
segment_ids += [1] * (len(tokens_b) + 1)
|
||||||
segment_ids.append(1)
|
|
||||||
tokens.append("[SEP]")
|
|
||||||
segment_ids.append(1)
|
|
||||||
|
|
||||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||||
|
|
||||||
@@ -260,10 +246,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
|
|||||||
input_mask = [1] * len(input_ids)
|
input_mask = [1] * len(input_ids)
|
||||||
|
|
||||||
# Zero-pad up to the sequence length.
|
# Zero-pad up to the sequence length.
|
||||||
while len(input_ids) < max_seq_length:
|
padding = [0] * (max_seq_length - len(input_ids))
|
||||||
input_ids.append(0)
|
input_ids += padding
|
||||||
input_mask.append(0)
|
input_mask += padding
|
||||||
segment_ids.append(0)
|
segment_ids += padding
|
||||||
|
|
||||||
assert len(input_ids) == max_seq_length
|
assert len(input_ids) == max_seq_length
|
||||||
assert len(input_mask) == max_seq_length
|
assert len(input_mask) == max_seq_length
|
||||||
@@ -409,14 +395,14 @@ def main():
|
|||||||
type=int,
|
type=int,
|
||||||
default=-1,
|
default=-1,
|
||||||
help="local_rank for distributed training on gpus")
|
help="local_rank for distributed training on gpus")
|
||||||
parser.add_argument('--seed',
|
parser.add_argument('--seed',
|
||||||
type=int,
|
type=int,
|
||||||
default=42,
|
default=42,
|
||||||
help="random seed for initialization")
|
help="random seed for initialization")
|
||||||
parser.add_argument('--gradient_accumulation_steps',
|
parser.add_argument('--gradient_accumulation_steps',
|
||||||
type=int,
|
type=int,
|
||||||
default=1,
|
default=1,
|
||||||
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
help="Number of updates steps to accumulate before performing a backward/update pass.")
|
||||||
parser.add_argument('--optimize_on_cpu',
|
parser.add_argument('--optimize_on_cpu',
|
||||||
default=False,
|
default=False,
|
||||||
action='store_true',
|
action='store_true',
|
||||||
@@ -437,6 +423,12 @@ def main():
|
|||||||
"mrpc": MrpcProcessor,
|
"mrpc": MrpcProcessor,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
num_labels_task = {
|
||||||
|
"cola": 2,
|
||||||
|
"mnli": 3,
|
||||||
|
"mrpc": 2,
|
||||||
|
}
|
||||||
|
|
||||||
if args.local_rank == -1 or args.no_cuda:
|
if args.local_rank == -1 or args.no_cuda:
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||||
n_gpu = torch.cuda.device_count()
|
n_gpu = torch.cuda.device_count()
|
||||||
@@ -475,6 +467,7 @@ def main():
|
|||||||
raise ValueError("Task not found: %s" % (task_name))
|
raise ValueError("Task not found: %s" % (task_name))
|
||||||
|
|
||||||
processor = processors[task_name]()
|
processor = processors[task_name]()
|
||||||
|
num_labels = num_labels_task[task_name]
|
||||||
label_list = processor.get_labels()
|
label_list = processor.get_labels()
|
||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||||
@@ -487,8 +480,9 @@ def main():
|
|||||||
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
|
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
|
||||||
|
|
||||||
# Prepare model
|
# Prepare model
|
||||||
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank) # for distributed learning
|
model = BertForSequenceClassification.from_pretrained(args.bert_model,
|
||||||
model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir)
|
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
|
||||||
|
num_labels = num_labels)
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
model.half()
|
model.half()
|
||||||
model.to(device)
|
model.to(device)
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
import math
|
import math
|
||||||
import torch
|
import torch
|
||||||
from torch.optim import Optimizer
|
from torch.optim import Optimizer
|
||||||
|
from torch.optim.optimizer import required
|
||||||
from torch.nn.utils import clip_grad_norm_
|
from torch.nn.utils import clip_grad_norm_
|
||||||
|
|
||||||
def warmup_cosine(x, warmup=0.002):
|
def warmup_cosine(x, warmup=0.002):
|
||||||
@@ -55,10 +56,10 @@ class BertAdam(Optimizer):
|
|||||||
weight_decay_rate: Weight decay. Default: 0.01
|
weight_decay_rate: Weight decay. Default: 0.01
|
||||||
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
|
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
|
||||||
"""
|
"""
|
||||||
def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear',
|
def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
|
||||||
b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
|
b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
|
||||||
max_grad_norm=1.0):
|
max_grad_norm=1.0):
|
||||||
if not lr >= 0.0:
|
if lr is not required and lr < 0.0:
|
||||||
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
|
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
|
||||||
if schedule not in SCHEDULES:
|
if schedule not in SCHEDULES:
|
||||||
raise ValueError("Invalid schedule parameter: {}".format(schedule))
|
raise ValueError("Invalid schedule parameter: {}".format(schedule))
|
||||||
|
|||||||
Reference in New Issue
Block a user