From a58361f197ec0d43ef28ce20fefd6dcb0c9c2ef7 Mon Sep 17 00:00:00 2001 From: deepset Date: Tue, 18 Dec 2018 10:32:25 +0100 Subject: [PATCH 01/15] Add example for fine tuning BERT language model (#1) Adds an example for loading a pre-trained BERT model and fine tune it as a language model (masked tokens & nextSentence) on your target corpus. --- examples/run_lm_finetuning.py | 674 ++++++++++++++++++++++++++++++++++ 1 file changed, 674 insertions(+) create mode 100644 examples/run_lm_finetuning.py diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py new file mode 100644 index 0000000000..3e8bc36f48 --- /dev/null +++ b/examples/run_lm_finetuning.py @@ -0,0 +1,674 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import logging +import argparse +from tqdm import tqdm, trange + +import numpy as np +import torch +from torch.utils.data import DataLoader, RandomSampler +from torch.utils.data.distributed import DistributedSampler + +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import BertForPreTraining +from pytorch_pretrained_bert.optimization import BertAdam + +from torch.utils.data import Dataset +import random + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class BERTDataset(Dataset): + def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True): + self.vocab = tokenizer.vocab + self.tokenizer = tokenizer + self.seq_len = seq_len + self.on_memory = on_memory + self.corpus_lines = corpus_lines # number of non-empty lines in input corpus + self.corpus_path = corpus_path + self.encoding = encoding + self.current_doc = 0 # to avoid random sentence from same doc + + # for loading samples directly from file + self.sample_counter = 0 # used to keep track of full epochs on file + self.line_buffer = None # keep second sentence of a pair in memory and use as first sentence in next pair + + # for loading samples in memory + self.current_random_doc = 0 + self.num_docs = 0 + + # load samples into memory + if on_memory: + self.all_docs = [] + doc = [] + self.corpus_lines = 0 + with open(corpus_path, "r", encoding=encoding) as f: + for line in tqdm(f, desc="Loading Dataset", total=corpus_lines): + line = line.strip() + if line == "": + self.all_docs.append(doc) + doc = [] + else: + doc.append(line) + self.corpus_lines = self.corpus_lines + 1 + # if last row in file is not empty + if self.all_docs[-1] 
!= doc: + self.all_docs.append(doc) + + self.num_docs = len(self.all_docs) + + # load samples later lazily from disk + else: + if self.corpus_lines is None: + with open(corpus_path, "r", encoding=encoding) as f: + self.corpus_lines = 0 + for line in tqdm(f, desc="Loading Dataset", total=corpus_lines): + if line.strip() == "": + self.num_docs += 1 + else: + self.corpus_lines += 1 + + # if doc does not end with empty line + if line.strip() != "": + self.num_docs += 1 + + self.file = open(corpus_path, "r", encoding=encoding) + self.random_file = open(corpus_path, "r", encoding=encoding) + + def __len__(self): + # last line of doc won't be used, because there's no "nextSentence". Additionally, we start counting at 0. + return self.corpus_lines - self.num_docs - 1 + + def __getitem__(self, item): + cur_id = self.sample_counter + self.sample_counter += 1 + if not self.on_memory: + # after one epoch we start again from beginning of file + if cur_id != 0 and (cur_id % len(self) == 0): + self.file.close() + self.file = open(self.corpus_path, "r", encoding=self.encoding) + + t1, t2, is_next_label = self.random_sent(item) + + # tokenize + tokens_a = self.tokenizer.tokenize(t1) + tokens_b = self.tokenizer.tokenize(t2) + + # combine to one sample + cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label) + + # transform sample to features + cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer) + + cur_tensors = {"input_ids": torch.tensor(cur_features.input_ids), + "input_mask": torch.tensor(cur_features.input_mask), + "segment_ids": torch.tensor(cur_features.segment_ids), + "lm_label_ids": torch.tensor(cur_features.lm_label_ids), + "is_next": torch.tensor(cur_features.is_next)} + + return cur_tensors + + def random_sent(self, index): + """ + Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences + from one doc. 
With 50% the second sentence will be a random one from another doc. + :param index: int, index of sample. + :return: (str, str, int), sentence 1, sentence 2, isNextSentence Label + """ + t1, t2 = self.get_corpus_line(index) + if random.random() > 0.5: + label = 0 + else: + t2 = self.get_random_line() + label = 1 + + assert len(t1) > 0 + assert len(t2) > 0 + return t1, t2, label + + def get_corpus_line(self, item): + """ + Get one sample from corpus consisting of a pair of two subsequent lines from the same doc. + :param item: int, index of sample. + :return: (str, str), two subsequent sentences from corpus + """ + t1 = "" + t2 = "" + assert item < self.corpus_lines + if self.on_memory: + # get the right doc + doc_id = 0 + doc_start = 0 + doc_end = len(self.all_docs[doc_id]) - 2 + while item > doc_end: + doc_id += 1 + doc_start = doc_end + 1 + doc_end += len(self.all_docs[doc_id]) - 1 + # get the right line within doc + line_in_doc = item - doc_start + t1 = self.all_docs[doc_id][line_in_doc] + t2 = self.all_docs[doc_id][line_in_doc + 1] + # used later to avoid random nextSentence from same doc + self.current_doc = doc_id + return t1, t2 + else: + if self.line_buffer is None: + # read first non-empty line of file + while t1 == "" : + t1 = self.file.__next__().strip() + t2 = self.file.__next__().strip() + else: + # use t2 from previous iteration as new t1 + t1 = self.line_buffer + t2 = self.file.__next__().strip() + # skip empty rows that are used for separating documents and keep track of current doc id + while t2 == "" or t1 == "": + t1 = self.file.__next__().strip() + t2 = self.file.__next__().strip() + self.current_doc = self.current_doc+1 + self.line_buffer = t2 + + assert t1 != "" + assert t2 != "" + return t1, t2 + + def get_random_line(self): + """ + Get random line from another document for nextSentence task. 
+ :return: str, content of one line + """ + # Similar to original tf repo: This outer loop should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document we're processing. + for _ in range(10): + if self.on_memory: + rand_doc_idx = random.randint(0, len(self.all_docs)-1) + rand_doc = self.all_docs[rand_doc_idx] + line = rand_doc[random.randrange(len(rand_doc))] + else: + rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000) + #pick random line + for _ in range(rand_index): + line = self.get_next_line() + #check if our picked random line is really from another doc like we want it to be + if self.current_random_doc != self.current_doc: + break + return line + + def get_next_line(self): + """ Gets next line of random_file and starts over when reaching end of file""" + try: + line = self.random_file.__next__().strip() + #keep track of which document we are currently looking at to later avoid having the same doc as t1 + if line == "": + self.current_random_doc = self.current_random_doc + 1 + line = self.random_file.__next__().strip() + except StopIteration: + self.random_file.close() + self.random_file = open(self.corpus_path, "r", encoding=self.encoding) + line = self.random_file.__next__().strip() + return line + + +class InputExample(object): + """A single training/test example for the language model.""" + + def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + tokens_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + tokens_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. 
This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.tokens_a = tokens_a + self.tokens_b = tokens_b + self.is_next = is_next # nextSentence + self.lm_labels = lm_labels # masked words for language model + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.is_next = is_next + self.lm_label_ids = lm_label_ids + + +def random_word(tokens, tokenizer): + """ + Masking some random tokens for Language Model task with probabilities as in the original BERT paper. + :param tokens: list of str, tokenized sentence. + :param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here) + :return: (list of str, list of int), masked tokens and related labels for LM prediction + """ + output_label = [] + + for i, token in enumerate(tokens): + prob = random.random() + # mask token with 15% probability + if prob < 0.15: + prob /= 0.15 + + # 80% randomly change token to mask token + if prob < 0.8: + tokens[i] = "[MASK]" + + # 10% randomly change token to random token + elif prob < 0.9: + tokens[i] = random.choice(list(tokenizer.vocab.items()))[0] + + # -> rest 10% randomly keep current token + + # append current token to output (we will predict these later) + try: + output_label.append(tokenizer.vocab[token]) + except KeyError: + # For unknown words (should not occur with BPE vocab) + output_label.append(tokenizer.vocab["[UNK]"]) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return tokens, output_label + + +def convert_example_to_features(example, max_seq_length, tokenizer): + """ + Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with + IDs, LM labels, input_mask, CLS and SEP tokens etc. 
+ :param example: InputExample, containing sentence input as strings and is_next label + :param max_seq_length: int, maximum length of sequence. + :param tokenizer: Tokenizer + :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training) + """ + tokens_a = example.tokens_a + tokens_b = example.tokens_b + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + + t1_random, t1_label = random_word(tokens_a, tokenizer) + t2_random, t2_label = random_word(tokens_b, tokenizer) + # concatenate lm labels and account for CLS, SEP, SEP + lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1]) + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambigiously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. 
+ tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + assert len(tokens_b) > 0 + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + lm_label_ids.append(-1) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(lm_label_ids) == max_seq_length + + if example.guid < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("LM label: %s " % (lm_label_ids)) + logger.info("Is next sentence label: %s " % (example.is_next)) + + features = InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + lm_label_ids=lm_label_ids, + is_next=example.is_next) + return features + + +def main(): + parser = argparse.ArgumentParser() + + ## Required parameters + parser.add_argument("--train_file", + default=None, + type=str, + required=True, + help="The input train corpus.") + parser.add_argument("--bert_model", default=None, type=str, required=True, + help="Bert pre-trained model selected in the list: bert-base-uncased, " + "bert-large-uncased, bert-base-cased, bert-base-multilingual, 
bert-base-chinese.") + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints will be written.") + + ## Other parameters + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_train", + default=False, + action='store_true', + help="Whether to run training.") + parser.add_argument("--train_batch_size", + default=32, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=8, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=3e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + default=False, + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument("--on_memory", + default=False, + action='store_true', + help="Whether to load train samples into memory or use disk") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumualte before performing a backward/update pass.") + parser.add_argument('--optimize_on_cpu', + default=False, + action='store_true', + help="Whether to perform optimization and keep the optimizer averages on CPU") + parser.add_argument('--fp16', + default=False, + action='store_true', + help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument('--loss_scale', + type=float, default=128, + help='Loss scaling, positive power of 2 values can improve fp16 convergence.') + + args = parser.parse_args() + + if args.local_rank == -1 or args.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + if args.fp16: + logger.info("16-bits training currently not supported in distributed training") + args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) + logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) + + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + 
args.gradient_accumulation_steps)) + + args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if not args.do_train and not args.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + os.makedirs(args.output_dir, exist_ok=True) + + tokenizer = BertTokenizer.from_pretrained(args.bert_model) + + #train_examples = None + num_train_steps = None + if args.do_train: + print("Loading Train Dataset", args.train_file) + train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, + corpus_lines=None, on_memory=args.on_memory) + num_train_steps = int( + len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) + + # Prepare model + model = BertForPreTraining.from_pretrained(args.bert_model) + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], + output_device=args.local_rank) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Prepare optimizer + if args.fp16: + param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ + for n, param in model.named_parameters()] + elif args.optimize_on_cpu: + param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ + for n, param in model.named_parameters()] + else: + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'gamma', 'beta'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01}, + {'params': [p for n, p in param_optimizer if n in 
no_decay], 'weight_decay_rate': 0.0} + ] + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_steps) + + global_step = 0 + if args.do_train: + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_dataset)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_steps) + + if args.local_rank == -1: + train_sampler = RandomSampler(train_dataset) + else: + #TODO: check if this works with current data generator from disk that relies on file.__next__ + # (it doesn't return item back by index) + train_sampler = DistributedSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + + model.train() + for _ in trange(int(args.num_train_epochs), desc="Epoch"): + tr_loss = 0 + nb_tr_examples, nb_tr_steps = 0, 0 + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + batch = tuple(t.to(device) for t in batch.values()) + input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch + loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. 
+ if args.fp16 and args.loss_scale != 1.0: + # rescale loss for fp16 training + # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html + loss = loss * args.loss_scale + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + loss.backward() + tr_loss += loss.item() + nb_tr_examples += input_ids.size(0) + nb_tr_steps += 1 + if (step + 1) % args.gradient_accumulation_steps == 0: + if args.fp16 or args.optimize_on_cpu: + if args.fp16 and args.loss_scale != 1.0: + # scale down gradients for fp16 training + for param in model.parameters(): + param.grad.data = param.grad.data / args.loss_scale + is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True) + if is_nan: + logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling") + args.loss_scale = args.loss_scale / 2 + model.zero_grad() + continue + optimizer.step() + copy_optimizer_params_to_model(model.named_parameters(), param_optimizer) + else: + optimizer.step() + model.zero_grad() + global_step += 1 + + logger.info("** ** * Saving fine - tuned model ** ** * ") + output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") + if n_gpu > 1: + torch.save(model.module.bert.state_dict(), output_model_file) + else: + torch.save(model.bert.state_dict(), output_model_file) + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def accuracy(out, labels): + outputs = np.argmax(out, axis=1) + return np.sum(outputs == labels) + + +def copy_optimizer_params_to_model(named_params_model, named_params_optimizer): + """ Utility function for optimize_on_cpu and 16-bits training. + Copy the parameters optimized on CPU/RAM back to the model on GPU + """ + for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model): + if name_opti != name_model: + logger.error("name_opti != name_model: {} {}".format(name_opti, name_model)) + raise ValueError + param_model.data.copy_(param_opti.data) + + +def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False): + """ Utility function for optimize_on_cpu and 16-bits training. + Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model + """ + is_nan = False + for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model): + if name_opti != name_model: + logger.error("name_opti != name_model: {} {}".format(name_opti, name_model)) + raise ValueError + if param_model.grad is not None: + if test_nan and torch.isnan(param_model.grad).sum() > 0: + is_nan = True + if param_opti.grad is None: + param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size())) + param_opti.grad.data.copy_(param_model.grad.data) + else: + param_opti.grad = None + return is_nan + + +if __name__ == "__main__": + main() \ No newline at end of file From 67f4dd56a39c7e34ff41df7fa7f1e0a87642dc56 Mon Sep 17 00:00:00 2001 From: tholor Date: Wed, 19 Dec 2018 09:22:37 +0100 Subject: [PATCH 02/15] update readme for run_lm_finetuning --- README.md | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/README.md 
b/README.md index d0fd120967..89d3ac4519 100644 --- a/README.md +++ b/README.md @@ -69,12 +69,13 @@ This package comprises the following classes that can be imported in Python and The repository further comprises: -- Four examples on how to use Bert (in the [`examples` folder](./examples)): +- Five examples on how to use Bert (in the [`examples` folder](./examples)): - [`extract_features.py`](./examples/extract_features.py) - Show how to extract hidden states from an instance of `BertModel`, - [`run_classifier.py`](./examples/run_classifier.py) - Show how to fine-tune an instance of `BertForSequenceClassification` on GLUE's MRPC task, - [`run_squad.py`](./examples/run_squad.py) - Show how to fine-tune an instance of `BertForQuestionAnswering` on SQuAD v1.0 task. - [`run_swag.py`](./examples/run_swag.py) - Show how to fine-tune an instance of `BertForMultipleChoice` on Swag task. - + - [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) - Show how to fine-tune an instance of `BertForPreTraining` on a target text corpus. + These examples are detailed in the [Examples](#examples) section of this readme. - Three notebooks that were used to check that the TensorFlow and PyTorch models behave identically (in the [`notebooks` folder](./notebooks)): @@ -246,6 +247,9 @@ An example on how to use this class is given in the [`extract_features.py`](./ex - the masked language modeling logits, and - the next sentence classification logits. + +An example on how to use this class is given in the [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) script, which can be used to fine-tune the BERT language model on your own target text corpus. This should improve model performance if the language style of that corpus differs from the original BERT training corpus (Wiki + BookCorpus). + #### 3. 
`BertForMaskedLM` @@ -347,7 +351,7 @@ The optimizer accepts the following arguments: | Sub-section | Description | |-|-| | [Training large models: introduction, tools and examples](#Training-large-models-introduction,-tools-and-examples) | How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models | -| [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_classif.py`, `run_classifier.py` and `run_squad.py` | +| [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_classif.py`, `run_classifier.py`, `run_squad.py` and `run_lm_finetuning.py` | | [Fine-tuning BERT-large on GPUs](#Fine-tuning-BERT-large-on-GPUs) | How to fine tune `BERT large`| ### Training large models: introduction, tools and examples @@ -378,7 +382,8 @@ We showcase several fine-tuning examples based on (and extended from) [the origi - a *sequence-level classifier* on the MRPC classification corpus, - a *token-level classifier* on the question answering dataset SQuAD, and - a *sequence-level multiple-choice classifier* on the SWAG classification corpus. - +- a *BERT language model* on another target corpus + #### MRPC This example code fine-tunes BERT on the Microsoft Research Paraphrase @@ -490,6 +495,25 @@ global_step = 13788 loss = 0.06423990014260186 ``` +#### LM Fine-tuning + +The data should be a text file in the same format as [sample_text.txt](./samples/sample_text.txt) (one sentence per line, docs separated by empty line). 
+ +Training one epoch on a 500k sentence corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with `train_batch_size=200` and `max_seq_length=128`: + + +```shell +python run_lm_finetuning.py \ + --bert_model bert-base-cased \ + --do_train \ + --train_file samples/sample_text.txt \ + --output_dir models \ + --num_train_epochs 5.0 \ + --learning_rate 3e-5 \ + --train_batch_size 32 \ + --max_seq_length 128 +``` + ## Fine-tuning BERT-large on GPUs The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation. From 7176674849680cfc38dda617677634222f0debaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9gory=20Ch=C3=A2tel?= Date: Thu, 20 Dec 2018 13:11:17 +0100 Subject: [PATCH 03/15] Fixing various class documentations. --- pytorch_pretrained_bert/modeling.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index acdc741f6d..8eb856e66a 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -456,7 +456,9 @@ class PreTrainedBertModel(nn.Module): . `bert-base-uncased` . `bert-large-uncased` . `bert-base-cased` - . `bert-base-multilingual` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` . `bert-base-chinese` - a path or url to a pretrained model archive containing: . `bert_config.json` a configuration file for the model @@ -1035,15 +1037,7 @@ class BertForQuestionAnswering(PreTrainedBertModel): the sequence output that computes start_logits and end_logits Params: - `config`: either - - a BertConfig class instance with the configuration to build a new model, or - - a str with the name of a pre-trained model to load selected in the list of: - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-base-multilingual` - . `bert-base-chinese` - The pre-trained model will be downloaded and cached if needed. 
+ `config`: a BertConfig class instance with the configuration to build a new model. Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] From e5fc98c542af436dd079b0c0f2b2dcac1a89594f Mon Sep 17 00:00:00 2001 From: tholor Date: Thu, 20 Dec 2018 18:30:52 +0100 Subject: [PATCH 04/15] add exemplary training data. update to nvidia apex. refactor 'item -> line in doc' mapping. add warning for unknown word. --- README.md | 4 +- examples/run_lm_finetuning.py | 162 +++++++++++++++------------------- 2 files changed, 71 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index 89d3ac4519..b22b66ae0c 100644 --- a/README.md +++ b/README.md @@ -498,8 +498,8 @@ loss = 0.06423990014260186 #### LM Fine-tuning The data should be a text file in the same format as [sample_text.txt](./samples/sample_text.txt) (one sentence per line, docs separated by empty line). - -Training one epoch on a 500k sentence corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with `train_batch_size=200` and `max_seq_length=128`: +You can download an [example training corpus](https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt) generated from Wikipedia articles and split into ~500k sentences with spaCy. +Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with `train_batch_size=200` and `max_seq_length=128`: ```shell diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 3e8bc36f48..9ca9830eff 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -1,5 +1,6 @@ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -41,6 +42,12 @@ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message logger = logging.getLogger(__name__) +def warmup_linear(x, warmup=0.002): + if x < warmup: + return x/warmup + return 1.0 - x + + class BERTDataset(Dataset): def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True): self.vocab = tokenizer.vocab @@ -59,6 +66,7 @@ class BERTDataset(Dataset): # for loading samples in memory self.current_random_doc = 0 self.num_docs = 0 + self.sample_to_doc = [] # map sample index to doc and line # load samples into memory if on_memory: @@ -71,12 +79,20 @@ class BERTDataset(Dataset): if line == "": self.all_docs.append(doc) doc = [] + #remove last added sample because there won't be a subsequent line anymore in the doc + self.sample_to_doc.pop() else: + #store as one sample + sample = {"doc_id": len(self.all_docs), + "line": len(doc)} + self.sample_to_doc.append(sample) doc.append(line) - self.corpus_lines = self.corpus_lines + 1 + self.corpus_lines = self.corpus_lines + 1 + # if last row in file is not empty if self.all_docs[-1] != doc: self.all_docs.append(doc) + self.sample_to_doc.pop() self.num_docs = len(self.all_docs) @@ -159,20 +175,11 @@ class BERTDataset(Dataset): t2 = "" assert item < self.corpus_lines if self.on_memory: - # get the right doc - doc_id = 0 - doc_start = 0 - doc_end = len(self.all_docs[doc_id]) - 2 - while item > doc_end: - doc_id += 1 - doc_start = doc_end + 1 - doc_end += len(self.all_docs[doc_id]) - 1 - # get the right line within doc - line_in_doc = item - doc_start - t1 = self.all_docs[doc_id][line_in_doc] - t2 = self.all_docs[doc_id][line_in_doc + 1] + sample = self.sample_to_doc[item] + t1 = self.all_docs[sample["doc_id"]][sample["line"]] + t2 = self.all_docs[sample["doc_id"]][sample["line"]+1] # used later to avoid random nextSentence from same doc - self.current_doc = doc_id + self.current_doc = sample["doc_id"] return t1, t2 else: if self.line_buffer is 
None: @@ -297,6 +304,7 @@ def random_word(tokens, tokenizer): except KeyError: # For unknown words (should not occur with BPE vocab) output_label.append(tokenizer.vocab["[UNK]"]) + logger.warning("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token)) else: # no masking token (will be ignored by loss function later) output_label.append(-1) @@ -468,17 +476,15 @@ def main(): type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.") - parser.add_argument('--optimize_on_cpu', - default=False, - action='store_true', - help="Whether to perform optimization and keep the optimizer averages on CPU") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', - type=float, default=128, - help='Loss scaling, positive power of 2 values can improve fp16 convergence.') + type = float, default = 0, + help = "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() @@ -486,14 +492,13 @@ def main(): device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: + torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') - if args.fp16: - logger.info("16-bits training currently not supported in distributed training") - args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) - logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( @@ -531,29 +536,42 @@ def main(): model.half() model.to(device) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank) + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer - if args.fp16: - param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ - for n, param in model.named_parameters()] - elif args.optimize_on_cpu: - param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ - for n, param in model.named_parameters()] - else: - param_optimizer = list(model.named_parameters()) 
- no_decay = ['bias', 'gamma', 'beta'] + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01}, - {'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0} + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_steps) + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_steps) global_step = 0 if args.do_train: @@ -580,33 +598,22 @@ def main(): loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
- if args.fp16 and args.loss_scale != 1.0: - # rescale loss for fp16 training - # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html - loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps - loss.backward() + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: - if args.fp16 or args.optimize_on_cpu: - if args.fp16 and args.loss_scale != 1.0: - # scale down gradients for fp16 training - for param in model.parameters(): - param.grad.data = param.grad.data / args.loss_scale - is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True) - if is_nan: - logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling") - args.loss_scale = args.loss_scale / 2 - model.zero_grad() - continue - optimizer.step() - copy_optimizer_params_to_model(model.named_parameters(), param_optimizer) - else: - optimizer.step() - model.zero_grad() + # modify learning rate with special warm up BERT uses + lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_steps, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() global_step += 1 logger.info("** ** * Saving fine - tuned model ** ** * ") @@ -639,36 +646,5 @@ def accuracy(out, labels): return np.sum(outputs == labels) -def copy_optimizer_params_to_model(named_params_model, named_params_optimizer): - """ Utility function for optimize_on_cpu and 16-bits training. 
- Copy the parameters optimized on CPU/RAM back to the model on GPU - """ - for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model): - if name_opti != name_model: - logger.error("name_opti != name_model: {} {}".format(name_opti, name_model)) - raise ValueError - param_model.data.copy_(param_opti.data) - - -def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False): - """ Utility function for optimize_on_cpu and 16-bits training. - Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model - """ - is_nan = False - for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model): - if name_opti != name_model: - logger.error("name_opti != name_model: {} {}".format(name_opti, name_model)) - raise ValueError - if param_model.grad is not None: - if test_nan and torch.isnan(param_model.grad).sum() > 0: - is_nan = True - if param_opti.grad is None: - param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size())) - param_opti.grad.data.copy_(param_model.grad.data) - else: - param_opti.grad = None - return is_nan - - if __name__ == "__main__": main() \ No newline at end of file From 99709ee61d887ac1a4431a54a4f78f008b5b11d6 Mon Sep 17 00:00:00 2001 From: Jasdeep Singh <33911313+SinghJasdeep@users.noreply.github.com> Date: Thu, 20 Dec 2018 13:55:47 -0800 Subject: [PATCH 05/15] loading saved model when n_classes != 2 Required to avoid: Assertion `t >= 0 && t < n_classes` failed, if your default number of classes is not 2.
--- examples/run_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index adf81f4e28..456b06b07f 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -558,7 +558,7 @@ def main(): # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) - model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict) + model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): From e626eecc25b92398b7cf1e06d4fad5ca1df72c18 Mon Sep 17 00:00:00 2001 From: wlhgtc Date: Sat, 22 Dec 2018 20:26:05 +0800 Subject: [PATCH 06/15] Update modeling.py --- pytorch_pretrained_bert/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index acdc741f6d..ad423e79dd 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -728,7 +728,7 @@ class BertForMaskedLM(PreTrainedBertModel): is only computed for the labels set in [0, ..., vocab_size] Outputs: - if `masked_lm_labels` is `None`: + if `masked_lm_labels` is not `None`: Outputs the masked language modeling loss. if `masked_lm_labels` is `None`: Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. From 186f75342eed9f7bd2505b1b41ef317ea89d657b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9gory=20Ch=C3=A2tel?= Date: Wed, 2 Jan 2019 14:00:59 +0100 Subject: [PATCH 07/15] Adding new pretrained model to the help of the `bert_model` argument. 
--- examples/run_classifier.py | 3 ++- examples/run_squad.py | 3 ++- examples/run_swag.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index adf81f4e28..e265ed73df 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -312,7 +312,8 @@ def main(): help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " - "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") + "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " + "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, diff --git a/examples/run_squad.py b/examples/run_squad.py index 6a97dd300b..8be4143a58 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -681,7 +681,8 @@ def main(): ## Required parameters parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " - "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") + "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " + "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") diff --git a/examples/run_swag.py b/examples/run_swag.py index caddbee8ab..c31696fec8 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -249,7 +249,8 @@ def main(): help="The input data dir. 
Should contain the .csv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " - "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") + "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " + "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, From be3b9bcf4db4f7e942c7f71eb1d7de3a8d476ad0 Mon Sep 17 00:00:00 2001 From: Jade Abbott Date: Thu, 3 Jan 2019 09:02:33 +0200 Subject: [PATCH 08/15] Allow one to use the pretrained model in evaluation when do_train is not selected --- examples/run_classifier.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index adf81f4e28..9236c6a252 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -430,8 +430,8 @@ def main(): if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") - - if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) @@ -554,7 +554,8 @@ def main(): # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") - torch.save(model_to_save.state_dict(), output_model_file) + if args.do_train: + torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) From b96149a19b225cc2eabd14c3227b8acc9b268b49 Mon Sep 17 00:00:00 2001 From: Jade 
Abbott Date: Thu, 3 Jan 2019 10:31:56 +0200 Subject: [PATCH 09/15] Training loss is not initialized if only do_eval is specified --- examples/run_classifier.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 9236c6a252..c99cc0e12a 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -430,7 +430,7 @@ def main(): if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") - + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) @@ -503,6 +503,7 @@ def main(): t_total=t_total) global_step = 0 + tr_loss = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) @@ -581,7 +582,8 @@ def main(): model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 - for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: + + for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) @@ -603,11 +605,11 @@ def main(): eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples - + loss = tr_loss/nb_tr_steps if args.do_train else None result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, - 'loss': tr_loss/nb_tr_steps} + 'loss': loss} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: From c64de50ea4b4cec7e87732abb621bf70c8fa8763 Mon Sep 17 00:00:00 2001 From: Jade Abbott Date: Thu, 3 Jan 2019 12:34:57 +0200 Subject: [PATCH 10/15] nb_tr_steps is not initialized --- 
examples/run_classifier.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index c99cc0e12a..8441c86937 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -503,6 +503,7 @@ def main(): t_total=t_total) global_step = 0 + nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features( @@ -565,6 +566,7 @@ def main(): if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) + # should tokenize this too. eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") From 193e2df8ba95efd6e3326cb0907576a0c74f1d74 Mon Sep 17 00:00:00 2001 From: Jade Abbott Date: Thu, 3 Jan 2019 13:13:06 +0200 Subject: [PATCH 11/15] Remove rogue comment --- examples/run_classifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 8441c86937..be212edc1b 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -566,7 +566,6 @@ def main(): if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) - # should tokenize this too. eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") From ca4e7aaa72551cdba39e49094f5a05962573c774 Mon Sep 17 00:00:00 2001 From: Sang-Kil Park Date: Sat, 5 Jan 2019 11:42:54 +0900 Subject: [PATCH 12/15] Fix error when `bert_model` param is path or url. Error occurs when `bert_model` param is path or url. Therefore, if it is path, specify the last path to prevent error. 
--- examples/run_squad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 6a97dd300b..bbc8038676 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -855,7 +855,7 @@ def main(): global_step = 0 if args.do_train: cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( - args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) + list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: From d0d9b384f2578869066d2d028ee7339bbb661e75 Mon Sep 17 00:00:00 2001 From: Li Dong Date: Mon, 7 Jan 2019 15:51:33 +0800 Subject: [PATCH 13/15] LayerNorm initialization The LayerNorm gamma and beta should be initialized by .fill_(1.0) and .zero_(). reference links: https://github.com/tensorflow/tensorflow/blob/989e78c412a7e0f5361d4d7dfdfb230c8136e749/tensorflow/contrib/layers/python/layers/layers.py#L2298 https://github.com/tensorflow/tensorflow/blob/989e78c412a7e0f5361d4d7dfdfb230c8136e749/tensorflow/contrib/layers/python/layers/layers.py#L2308 --- pytorch_pretrained_bert/modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index acdc741f6d..83562ea573 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -439,8 +439,8 @@ class PreTrainedBertModel(nn.Module): # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, BertLayerNorm): - module.bias.data.normal_(mean=0.0, std=self.config.initializer_range) - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.bias.data.zero_() + module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and 
module.bias is not None: module.bias.data.zero_() From c9fd3505678d581388fb44ba1d79ac41e8fb28a4 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 7 Jan 2019 13:01:54 +0100 Subject: [PATCH 14/15] remove default when action is store_true in arguments --- examples/extract_features.py | 3 +-- examples/run_classifier.py | 5 ----- examples/run_lm_finetuning.py | 4 ---- examples/run_squad.py | 9 +++------ examples/run_swag.py | 5 ----- 5 files changed, 4 insertions(+), 22 deletions(-) diff --git a/examples/extract_features.py b/examples/extract_features.py index 4f8812121e..9d05d7905d 100644 --- a/examples/extract_features.py +++ b/examples/extract_features.py @@ -199,7 +199,7 @@ def main(): "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") ## Other parameters - parser.add_argument("--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") + parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. 
Sequences longer " @@ -210,7 +210,6 @@ def main(): default=-1, help = "local_rank for distributed training on gpus") parser.add_argument("--no_cuda", - default=False, action='store_true', help="Whether not to use CUDA when available") diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 0afd443402..31877a5414 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -333,15 +333,12 @@ def main(): "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", - default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", - default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", - default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", @@ -366,7 +363,6 @@ def main(): help="Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", - default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", @@ -382,7 +378,6 @@ def main(): default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', - default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 9ca9830eff..2c64f67b5f 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -432,7 +432,6 @@ def main(): "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", - default=False, action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", @@ -457,11 +456,9 @@ def main(): help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", - default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--on_memory", - default=False, action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument("--local_rank", @@ -477,7 +474,6 @@ def main(): default=1, help="Number of updates steps to accumualte before performing a backward/update pass.") parser.add_argument('--fp16', - default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', diff --git a/examples/run_squad.py b/examples/run_squad.py index a4a568d999..88ea590936 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -698,8 +698,8 @@ def main(): parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. 
Questions longer than this will " "be truncated to this length.") - parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") - parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") + parser.add_argument("--do_train", action='store_true', help="Whether to run training.") + parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") @@ -714,11 +714,10 @@ def main(): parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") - parser.add_argument("--verbose_logging", default=False, action='store_true', + parser.add_argument("--verbose_logging", action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", - default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', @@ -730,7 +729,6 @@ def main(): default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_lower_case", - default=True, action='store_true', help="Whether to lower case the input text. 
True for uncased models, False for cased models.") parser.add_argument("--local_rank", @@ -738,7 +736,6 @@ def main(): default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', - default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', diff --git a/examples/run_swag.py b/examples/run_swag.py index c31696fec8..3fb87ae3e7 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -265,15 +265,12 @@ def main(): "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", - default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", - default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", - default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", @@ -298,7 +295,6 @@ def main(): help="Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", - default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", @@ -314,7 +310,6 @@ def main(): default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', - default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', From 2e4db64cab198dc241e18221ef088908f2587c61 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 7 Jan 2019 13:06:42 +0100 Subject: [PATCH 15/15] add do_lower_case tokenizer loading optino in run_squad and ine_tuning examples --- examples/run_lm_finetuning.py | 2 +- examples/run_squad.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index 2c64f67b5f..39df2e99f8 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -515,7 +515,7 @@ def main(): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) - tokenizer = BertTokenizer.from_pretrained(args.bert_model) + tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_steps = None diff --git a/examples/run_squad.py b/examples/run_squad.py index 88ea590936..245aee0ff2 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -786,7 +786,7 @@ def main(): raise ValueError("Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) - tokenizer = BertTokenizer.from_pretrained(args.bert_model) + tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None